encoding_estimator 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +2 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +121 -0
- data/Rakefile +2 -0
- data/bin/encest-detect +62 -0
- data/bin/encest-gen +105 -0
- data/encoding_estimator.gemspec +31 -0
- data/lib/encoding_estimator/builder/model_builder.rb +70 -0
- data/lib/encoding_estimator/builder/parallel_model_builder.rb +45 -0
- data/lib/encoding_estimator/conversion.rb +108 -0
- data/lib/encoding_estimator/detection.rb +49 -0
- data/lib/encoding_estimator/detector.rb +156 -0
- data/lib/encoding_estimator/distribution.rb +49 -0
- data/lib/encoding_estimator/lang/de.json +1 -0
- data/lib/encoding_estimator/lang/en.json +1 -0
- data/lib/encoding_estimator/lang/es.json +1 -0
- data/lib/encoding_estimator/lang/fr.json +1 -0
- data/lib/encoding_estimator/lang/ru.json +1 -0
- data/lib/encoding_estimator/language_model.rb +73 -0
- data/lib/encoding_estimator/parallel_support.rb +49 -0
- data/lib/encoding_estimator/version.rb +3 -0
- data/lib/encoding_estimator.rb +66 -0
- metadata +142 -0
@@ -0,0 +1,66 @@
|
|
1
|
+
require_relative 'encoding_estimator/version'
|
2
|
+
|
3
|
+
require_relative 'encoding_estimator/builder/parallel_model_builder'
|
4
|
+
require_relative 'encoding_estimator/detector'
|
5
|
+
require_relative 'encoding_estimator/language_model'
|
6
|
+
|
7
|
+
module EncodingEstimator

  # Convert a string to a UTF-8 string by performing the conversion that
  # is automatically detected by EncodingEstimator.
  #
  # @param [String] data String to convert to UTF-8
  # @param [Hash] config Detection options. Every key is optional; keys that are
  #                      not given fall back to the defaults listed below.
  # @option config [Array<Symbol|String>] :languages ([:de, :en]) List of languages the data might
  #                      originate from, two-letter-codes, e.g. [:de, :en]
  # @option config [Array<String>] :encodings (['iso-8859-1', 'utf-16le', 'windows-1251']) List of
  #                      encodings to test, e.g. [ 'UTF-8', 'ISO-8859-1' ]. The order defines the
  #                      priority when choosing from encodings with same detection score
  # @option config [Array<Symbol>] :operations ([Conversion::Operation::DECODE]) Choose which
  #                      operations (encoding to/decoding from an encoding to UTF-8) to test
  # @option config [Float] :penalty (0.01) Penalty threshold to define when chars are weighted negative
  # @option config [Integer] :num_cores (nil) Number of threads to use for detection. Use "nil" to
  #                      use single threaded implementation
  # @option config [Boolean] :include_default (true) Include "keep as is" conversion when testing,
  #                      e.g. check if the string is already UTF-8 encoded
  #
  # @return [String] UTF-8 string
  def self.ensure_utf8(data, config = {})
    # detect merges the defaults itself, so the caller's options are passed through untouched.
    detect(data, config).result.perform(data)
  end

  # Let the EncodingEstimator detect how the input string is encoded.
  #
  # @param [String] data String whose encoding should be detected
  # @param [Hash] config Detection options. Every key is optional; keys that are
  #                      not given fall back to the defaults listed below.
  # @option config [Array<Symbol|String>] :languages ([:de, :en]) List of languages the data might
  #                      originate from, two-letter-codes, e.g. [:de, :en]
  # @option config [Array<String>] :encodings (['iso-8859-1', 'utf-16le', 'windows-1251']) List of
  #                      encodings to test, e.g. [ 'UTF-8', 'ISO-8859-1' ]. The order defines the
  #                      priority when choosing from encodings with same detection score
  # @option config [Array<Symbol>] :operations ([Conversion::Operation::DECODE]) Choose which
  #                      operations (encoding to/decoding from an encoding to UTF-8) to test
  # @option config [Float] :penalty (0.01) Penalty threshold to define when chars are weighted negative
  # @option config [Integer] :num_cores (nil) Number of threads to use for detection. Use "nil" to
  #                      use single threaded implementation
  # @option config [Boolean] :include_default (true) Include "keep as is" conversion when testing,
  #                      e.g. check if the string is already UTF-8 encoded
  #
  # @return [EncodingEstimator::Detection] Detection result with scores for all conversions
  def self.detect(data, config = {})
    params = default_config.merge(config)

    conversions = Conversion.generate(params[:encodings], params[:operations], params[:include_default])
    models      = params[:languages].map { |language| EncodingEstimator::LanguageModel.new(language) }

    Detector.new(conversions, models, params[:penalty], params[:num_cores]).detect(data)
  end

  # Build the default detection options shared by ensure_utf8 and detect.
  #
  # This is a method rather than a constant so that Conversion is only
  # resolved when a detection is actually requested, not while this file is
  # being loaded. A fresh hash is returned on every call, so callers cannot
  # mutate the defaults.
  #
  # @return [Hash] default values for all supported config keys
  def self.default_config
    {
      languages:       [:de, :en],
      encodings:       %w(iso-8859-1 utf-16le windows-1251),
      operations:      [Conversion::Operation::DECODE],
      include_default: true,
      penalty:         0.01,
      num_cores:       nil
    }
  end
  private_class_method :default_config
end
|
metadata
ADDED
@@ -0,0 +1,142 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: encoding_estimator
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Oskar Kirmis
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-12-26 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.13'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.13'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: htmlentities
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '4.3'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '4.3'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: json
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '2.0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '2.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: slop
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '4.4'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '4.4'
|
83
|
+
description: This gem allows you to detect the encoding of a string based on its
|
84
|
+
content. It uses character distribution statistics to check which encoding is the
|
85
|
+
one that gives you the best results.
|
86
|
+
email:
|
87
|
+
- kirmis@st.ovgu.de
|
88
|
+
executables:
|
89
|
+
- encest-detect
|
90
|
+
- encest-gen
|
91
|
+
extensions: []
|
92
|
+
extra_rdoc_files: []
|
93
|
+
files:
|
94
|
+
- ".gitignore"
|
95
|
+
- CODE_OF_CONDUCT.md
|
96
|
+
- Gemfile
|
97
|
+
- LICENSE.txt
|
98
|
+
- README.md
|
99
|
+
- Rakefile
|
100
|
+
- bin/encest-detect
|
101
|
+
- bin/encest-gen
|
102
|
+
- encoding_estimator.gemspec
|
103
|
+
- lib/encoding_estimator.rb
|
104
|
+
- lib/encoding_estimator/builder/model_builder.rb
|
105
|
+
- lib/encoding_estimator/builder/parallel_model_builder.rb
|
106
|
+
- lib/encoding_estimator/conversion.rb
|
107
|
+
- lib/encoding_estimator/detection.rb
|
108
|
+
- lib/encoding_estimator/detector.rb
|
109
|
+
- lib/encoding_estimator/distribution.rb
|
110
|
+
- lib/encoding_estimator/lang/de.json
|
111
|
+
- lib/encoding_estimator/lang/en.json
|
112
|
+
- lib/encoding_estimator/lang/es.json
|
113
|
+
- lib/encoding_estimator/lang/fr.json
|
114
|
+
- lib/encoding_estimator/lang/ru.json
|
115
|
+
- lib/encoding_estimator/language_model.rb
|
116
|
+
- lib/encoding_estimator/parallel_support.rb
|
117
|
+
- lib/encoding_estimator/version.rb
|
118
|
+
homepage: https://git.iftrue.de/okirmis/encoding_estimator
|
119
|
+
licenses:
|
120
|
+
- MIT
|
121
|
+
metadata: {}
|
122
|
+
post_install_message:
|
123
|
+
rdoc_options: []
|
124
|
+
require_paths:
|
125
|
+
- lib
|
126
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
127
|
+
requirements:
|
128
|
+
- - ">="
|
129
|
+
- !ruby/object:Gem::Version
|
130
|
+
version: '0'
|
131
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
132
|
+
requirements:
|
133
|
+
- - ">="
|
134
|
+
- !ruby/object:Gem::Version
|
135
|
+
version: '0'
|
136
|
+
requirements: []
|
137
|
+
rubyforge_project:
|
138
|
+
rubygems_version: 2.6.8
|
139
|
+
signing_key:
|
140
|
+
specification_version: 4
|
141
|
+
summary: Detect encoding of an input string using character count statistics.
|
142
|
+
test_files: []
|