encoding_estimator 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,66 @@
1
+ require_relative 'encoding_estimator/version'
2
+
3
+ require_relative 'encoding_estimator/builder/parallel_model_builder'
4
+ require_relative 'encoding_estimator/detector'
5
+ require_relative 'encoding_estimator/language_model'
6
+
7
module EncodingEstimator

  # Convert a string by performing the conversion that EncodingEstimator
  # detects for it (documented by the gem as yielding a UTF-8 string).
  #
  # The options are forwarded untouched to EncodingEstimator.detect, which
  # supplies the default for every key that is not given. Keeping the defaults
  # in one place avoids two copies of the same hash drifting apart.
  #
  # @param [String] data String to convert to UTF-8
  # @param [Hash] config Detection options; every key is optional
  # @option config [Array<Symbol|String>] :languages List of languages the data might originate from,
  #                                                  two-letter-codes, e.g. [:de, :en]
  # @option config [Array<String>] :encodings List of encodings to test, e.g. [ 'UTF-8', 'ISO-8859-1' ].
  #                                           The order defines the priority when choosing from encodings
  #                                           with same detection score
  # @option config [Array<Symbol>] :operations Choose which operations (encoding to/decoding from an
  #                                            encoding to UTF-8) to test
  # @option config [Float] :penalty Penalty threshold to define when chars are weighted negative
  # @option config [Integer] :num_cores Number of threads to use for detection. Use "nil" to use the
  #                                     single threaded implementation
  # @option config [Boolean] :include_default Include "keep as is" conversion when testing, e.g. check
  #                                           if the string is already UTF-8 encoded
  #
  # @return [String] UTF-8 string
  def self.ensure_utf8(data, config = {})
    detect(data, config).result.perform(data)
  end

  # Let the EncodingEstimator detect how the input string is encoded
  #
  # @param [String] data String whose encoding should be detected
  # @param [Hash] config Detection options; every key is optional and falls back to the
  #                      default listed in the method body
  # @option config [Array<Symbol|String>] :languages List of languages the data might originate from,
  #                                                  two-letter-codes, e.g. [:de, :en]
  # @option config [Array<String>] :encodings List of encodings to test, e.g. [ 'UTF-8', 'ISO-8859-1' ].
  #                                           The order defines the priority when choosing from encodings
  #                                           with same detection score
  # @option config [Array<Symbol>] :operations Choose which operations (encoding to/decoding from an
  #                                            encoding to UTF-8) to test
  # @option config [Float] :penalty Penalty threshold to define when chars are weighted negative
  # @option config [Integer] :num_cores Number of threads to use for detection. Use "nil" to use the
  #                                     single threaded implementation
  # @option config [Boolean] :include_default Include "keep as is" conversion when testing, e.g. check
  #                                           if the string is already UTF-8 encoded
  #
  # @return [EncodingEstimator::Detection] Detection result with scores for all conversions
  def self.detect(data, config = {})
    # Caller supplied keys win over the defaults.
    params = {
      languages:       [:de, :en],
      encodings:       %w(iso-8859-1 utf-16le windows-1251),
      operations:      [Conversion::Operation::DECODE],
      include_default: true,
      penalty:         0.01,
      num_cores:       nil
    }.merge(config)

    conversions = Conversion.generate(params[:encodings], params[:operations], params[:include_default])

    # One language model per requested language code.
    models = params[:languages].map { |language| EncodingEstimator::LanguageModel.new(language) }

    Detector.new(conversions, models, params[:penalty], params[:num_cores]).detect(data)
  end
end
metadata ADDED
@@ -0,0 +1,142 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: encoding_estimator
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Oskar Kirmis
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-12-26 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.13'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.13'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: htmlentities
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '4.3'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '4.3'
55
+ - !ruby/object:Gem::Dependency
56
+ name: json
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '2.0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '2.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: slop
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '4.4'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '4.4'
83
+ description: This gem allows you to detect the encoding of a string based on its
84
+ content. It uses character distribution statistics to check which encoding is the
85
+ one that gives you the best results.
86
+ email:
87
+ - kirmis@st.ovgu.de
88
+ executables:
89
+ - encest-detect
90
+ - encest-gen
91
+ extensions: []
92
+ extra_rdoc_files: []
93
+ files:
94
+ - ".gitignore"
95
+ - CODE_OF_CONDUCT.md
96
+ - Gemfile
97
+ - LICENSE.txt
98
+ - README.md
99
+ - Rakefile
100
+ - bin/encest-detect
101
+ - bin/encest-gen
102
+ - encoding_estimator.gemspec
103
+ - lib/encoding_estimator.rb
104
+ - lib/encoding_estimator/builder/model_builder.rb
105
+ - lib/encoding_estimator/builder/parallel_model_builder.rb
106
+ - lib/encoding_estimator/conversion.rb
107
+ - lib/encoding_estimator/detection.rb
108
+ - lib/encoding_estimator/detector.rb
109
+ - lib/encoding_estimator/distribution.rb
110
+ - lib/encoding_estimator/lang/de.json
111
+ - lib/encoding_estimator/lang/en.json
112
+ - lib/encoding_estimator/lang/es.json
113
+ - lib/encoding_estimator/lang/fr.json
114
+ - lib/encoding_estimator/lang/ru.json
115
+ - lib/encoding_estimator/language_model.rb
116
+ - lib/encoding_estimator/parallel_support.rb
117
+ - lib/encoding_estimator/version.rb
118
+ homepage: https://git.iftrue.de/okirmis/encoding_estimator
119
+ licenses:
120
+ - MIT
121
+ metadata: {}
122
+ post_install_message:
123
+ rdoc_options: []
124
+ require_paths:
125
+ - lib
126
+ required_ruby_version: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ required_rubygems_version: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
136
+ requirements: []
137
+ rubyforge_project:
138
+ rubygems_version: 2.6.8
139
+ signing_key:
140
+ specification_version: 4
141
+ summary: Detect encoding of an input string using character count statistics.
142
+ test_files: []