encoding_estimator 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,66 @@
1
+ require_relative 'encoding_estimator/version'
2
+
3
+ require_relative 'encoding_estimator/builder/parallel_model_builder'
4
+ require_relative 'encoding_estimator/detector'
5
+ require_relative 'encoding_estimator/language_model'
6
+
7
module EncodingEstimator

  # Convert a string to a UTF-8 string by performing the conversion that
  # is automatically detected by EncodingEstimator.
  #
  # @param [String] data String to convert to UTF-8
  # @param [Hash] config Detection options; every key is optional and falls back to the default shown below
  # @option config [Array<Symbol|String>] :languages ([:de, :en]) Languages the data might originate from, two-letter codes
  # @option config [Array<String>] :encodings (%w(iso-8859-1 utf-16le windows-1251)) Encodings to test.
  #                                            The order defines the priority when choosing from encodings with the same detection score
  # @option config [Array<Symbol>] :operations ([Conversion::Operation::DECODE]) Which operations
  #                                            (encoding to/decoding from an encoding to UTF-8) to test
  # @option config [Boolean] :include_default (true) Include the "keep as is" conversion when testing,
  #                                            e.g. check if the string is already UTF-8 encoded
  # @option config [Float] :penalty (0.01) Penalty threshold to define when chars are weighted negative
  # @option config [Integer, nil] :num_cores (nil) Number of threads to use for detection.
  #                                            Use nil for the single threaded implementation
  #
  # @return [String] UTF-8 string
  def EncodingEstimator.ensure_utf8( data, config = {} )
    # detect fills in the defaults itself, so the caller's options are forwarded untouched
    EncodingEstimator.detect( data, config ).result.perform( data )
  end

  # Let the EncodingEstimator detect how the input string is encoded.
  #
  # @param [String] data String whose encoding should be detected
  # @param [Hash] config Detection options; every key is optional and falls back to the default shown below
  # @option config [Array<Symbol|String>] :languages ([:de, :en]) Languages the data might originate from, two-letter codes
  # @option config [Array<String>] :encodings (%w(iso-8859-1 utf-16le windows-1251)) Encodings to test.
  #                                            The order defines the priority when choosing from encodings with the same detection score
  # @option config [Array<Symbol>] :operations ([Conversion::Operation::DECODE]) Which operations
  #                                            (encoding to/decoding from an encoding to UTF-8) to test
  # @option config [Boolean] :include_default (true) Include the "keep as is" conversion when testing,
  #                                            e.g. check if the string is already UTF-8 encoded
  # @option config [Float] :penalty (0.01) Penalty threshold to define when chars are weighted negative
  # @option config [Integer, nil] :num_cores (nil) Number of threads to use for detection.
  #                                            Use nil for the single threaded implementation
  #
  # @return [EncodingEstimator::Detection] Detection result with scores for all conversions
  def EncodingEstimator.detect( data, config = {} )
    params = default_config.merge( config )

    # One candidate conversion per encoding/operation pair (plus the optional "keep as is" one)
    conversions = Conversion.generate( params[ :encodings ], params[ :operations ], params[ :include_default ] )

    # One language model per requested language
    models = params[ :languages ].map { |l| EncodingEstimator::LanguageModel.new( l ) }

    Detector.new( conversions, models, params[ :penalty ], params[ :num_cores ] ).detect( data )
  end

  # Default detection options shared by ensure_utf8 and detect.
  #
  # Built on every call (instead of being stored in a constant) so that
  # Conversion::Operation::DECODE is only resolved when a detection is
  # actually requested, and so callers can never mutate a shared hash.
  #
  # @return [Hash] Fresh hash holding the default value for every supported option
  def EncodingEstimator.default_config
    {
      languages:       [ :de, :en ],
      encodings:       %w(iso-8859-1 utf-16le windows-1251),
      operations:      [ Conversion::Operation::DECODE ],
      include_default: true,
      penalty:         0.01,
      num_cores:       nil
    }
  end
  private_class_method :default_config
end
metadata ADDED
@@ -0,0 +1,142 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: encoding_estimator
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Oskar Kirmis
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-12-26 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.13'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.13'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: htmlentities
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '4.3'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '4.3'
55
+ - !ruby/object:Gem::Dependency
56
+ name: json
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '2.0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '2.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: slop
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '4.4'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '4.4'
83
+ description: This gem allows you to detect the encoding of a string based on its
84
+ content. It uses character distribution statistics to check which encoding is the
85
+ one that gives you the best results.
86
+ email:
87
+ - kirmis@st.ovgu.de
88
+ executables:
89
+ - encest-detect
90
+ - encest-gen
91
+ extensions: []
92
+ extra_rdoc_files: []
93
+ files:
94
+ - ".gitignore"
95
+ - CODE_OF_CONDUCT.md
96
+ - Gemfile
97
+ - LICENSE.txt
98
+ - README.md
99
+ - Rakefile
100
+ - bin/encest-detect
101
+ - bin/encest-gen
102
+ - encoding_estimator.gemspec
103
+ - lib/encoding_estimator.rb
104
+ - lib/encoding_estimator/builder/model_builder.rb
105
+ - lib/encoding_estimator/builder/parallel_model_builder.rb
106
+ - lib/encoding_estimator/conversion.rb
107
+ - lib/encoding_estimator/detection.rb
108
+ - lib/encoding_estimator/detector.rb
109
+ - lib/encoding_estimator/distribution.rb
110
+ - lib/encoding_estimator/lang/de.json
111
+ - lib/encoding_estimator/lang/en.json
112
+ - lib/encoding_estimator/lang/es.json
113
+ - lib/encoding_estimator/lang/fr.json
114
+ - lib/encoding_estimator/lang/ru.json
115
+ - lib/encoding_estimator/language_model.rb
116
+ - lib/encoding_estimator/parallel_support.rb
117
+ - lib/encoding_estimator/version.rb
118
+ homepage: https://git.iftrue.de/okirmis/encoding_estimator
119
+ licenses:
120
+ - MIT
121
+ metadata: {}
122
+ post_install_message:
123
+ rdoc_options: []
124
+ require_paths:
125
+ - lib
126
+ required_ruby_version: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ required_rubygems_version: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
136
+ requirements: []
137
+ rubyforge_project:
138
+ rubygems_version: 2.6.8
139
+ signing_key:
140
+ specification_version: 4
141
+ summary: Detect encoding of an input string using character count statistics.
142
+ test_files: []