encoding_estimator 0.1.2 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -1
- data/.gitlab-ci.yml +2 -2
- data/README.md +3 -0
- data/bin/encest-detect +1 -1
- data/bin/encest-gen +1 -1
- data/encoding_estimator.gemspec +3 -0
- data/lib/encoding_estimator.rb +2 -3
- data/lib/encoding_estimator/builder/model_builder.rb +1 -1
- data/lib/encoding_estimator/builder/parallel_model_builder.rb +1 -1
- data/lib/encoding_estimator/conversion.rb +7 -0
- data/lib/encoding_estimator/distribution.rb +1 -1
- data/lib/encoding_estimator/language_model.rb +3 -3
- data/lib/encoding_estimator/version.rb +1 -1
- data/tutorials/interactive.md +34 -0
- data/tutorials/noninteractive.md +34 -0
- data/tutorials/tutorial.md +6 -0
- metadata +47 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f753dae50ff0610f5a06eb7e13826de852c094a2
|
4
|
+
data.tar.gz: 2fd62584bc93ef4fbe2233cda5c862dd0c85a02c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fc5f8224a9f4ab7b088328036d626ffe45828a3b2b024800f768f183674b49caa2bfb51a4bf8d6a7a7f6f601136c72f6b16a50ec612c130e09075e908067138f
|
7
|
+
data.tar.gz: c4e2d79b8e3e7f70b7fe705755e99f43b1ab27f4470456adf535ef0abcc8ab8960cd6183615c322747eb7f4641dc62ba52795cd9860db04867850f0e051dce0a
|
data/.gitignore
CHANGED
data/.gitlab-ci.yml
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# EncodingEstimator: Detect encoding of strings
|
2
2
|
|
3
3
|
![Build Status](https://git.iftrue.de/okirmis/encoding_estimator/badges/master/build.svg)
|
4
|
+
![Code Coverage](https://git.iftrue.de/okirmis/encoding_estimator/badges/master/coverage.svg)
|
4
5
|
|
5
6
|
This gem allows you to detect the encoding of strings/files based on their content. This can be useful if you need to load data from sources with unknown encodings. The gem uses character distribution statistics to check which encoding is the one that gives you the best results.
|
6
7
|
|
@@ -19,6 +20,8 @@ The second one is a shortcut you can use in case you just want to be sure to get
|
|
19
20
|
utf8_txt = EncodingEstimator.ensure_utf8( File.read( 'foo.txt' ), languages: [ :en, :de ] )
|
20
21
|
```
|
21
22
|
|
23
|
+
More detailed tutorials can be found [here](./tutorials/tutorial.md).
|
24
|
+
|
22
25
|
If you need more control over the operations to perform, just have a look at `EncodingEstimator::Detector` and `EncodingEstimator::Conversion`.
|
23
26
|
|
24
27
|
|
data/bin/encest-detect
CHANGED
@@ -51,7 +51,7 @@ opts[:threads] = opts[:threads] == 0 ? nil : opts[:threads]
|
|
51
51
|
|
52
52
|
# Process every file
|
53
53
|
opts.arguments.each do |file|
|
54
|
-
detection = EncodingEstimator.detect File.read(file ), {
|
54
|
+
detection = EncodingEstimator.detect File.read( file, encoding: 'utf-8' ), {
|
55
55
|
languages: opts[:languages], encodings: opts[:encodings],
|
56
56
|
operations: opts[:operations], include_default: true,
|
57
57
|
num_cores: opts[:threads]
|
data/bin/encest-gen
CHANGED
@@ -101,5 +101,5 @@ configurations.each do |config|
|
|
101
101
|
runner.execute!( opts[:threads], !silent )
|
102
102
|
|
103
103
|
# Save the model as json
|
104
|
-
File.open("#{config.language}.json", 'w') { |f| f.write JSON.unparse(runner.results) }
|
104
|
+
File.open("#{config.language}.json", 'w:utf-8') { |f| f.write JSON.unparse(runner.results) }
|
105
105
|
end
|
data/encoding_estimator.gemspec
CHANGED
@@ -26,6 +26,9 @@ Gem::Specification.new do |spec|
|
|
26
26
|
|
27
27
|
spec.add_development_dependency 'bundler'
|
28
28
|
spec.add_development_dependency 'minitest'
|
29
|
+
spec.add_development_dependency 'parallel'
|
30
|
+
spec.add_development_dependency 'simplecov'
|
31
|
+
spec.add_development_dependency 'simplecov-parallel'
|
29
32
|
|
30
33
|
spec.add_dependency 'htmlentities', '~> 4.3'
|
31
34
|
spec.add_dependency 'json', '~> 2.0'
|
data/lib/encoding_estimator.rb
CHANGED
@@ -28,7 +28,7 @@ module EncodingEstimator
|
|
28
28
|
operations: [Conversion::Operation::DECODE],
|
29
29
|
include_default: true,
|
30
30
|
penalty: 0.01,
|
31
|
-
num_cores: nil
|
31
|
+
num_cores: nil
|
32
32
|
}.merge config
|
33
33
|
|
34
34
|
EncodingEstimator.detect( data, params ).result.perform( data )
|
@@ -48,14 +48,13 @@ module EncodingEstimator
|
|
48
48
|
#
|
49
49
|
# @return [EncodingEstimator::Detection] Detection result with scores for all conversions
|
50
50
|
def EncodingEstimator.detect( data, config )
|
51
|
-
|
52
51
|
params = {
|
53
52
|
languages: [ :de, :en ],
|
54
53
|
encodings: %w(iso-8859-1 utf-16le windows-1251),
|
55
54
|
operations: [Conversion::Operation::DECODE],
|
56
55
|
include_default: true,
|
57
56
|
penalty: 0.01,
|
58
|
-
num_cores: nil
|
57
|
+
num_cores: nil
|
59
58
|
}.merge config
|
60
59
|
|
61
60
|
Detector.new(
|
@@ -59,7 +59,7 @@ module EncodingEstimator
|
|
59
59
|
#
|
60
60
|
# @return [String] Content of the file without whitespaces
|
61
61
|
def load_content
|
62
|
-
raw = File.read( @filename ).encode('UTF-16be', invalid: :replace, replace: '').encode('UTF-8')
|
62
|
+
raw = File.read( @filename, encoding: 'utf-8' ).encode('UTF-16be', invalid: :replace, replace: '').encode('UTF-8')
|
63
63
|
decoder = HTMLEntities.new
|
64
64
|
plaintext = decoder.decode raw
|
65
65
|
|
@@ -28,7 +28,7 @@ module EncodingEstimator
|
|
28
28
|
# @param [Boolean] show_progress if set to true and the ruby-progressbar gem is installed, show a progressbar
|
29
29
|
# @return [Hash] Character count statistics combined from all files of the directory, scaled linear
|
30
30
|
def execute!( max_processes = 4, show_progress = true )
|
31
|
-
if EncodingEstimator::ParallelSupport.supported?
|
31
|
+
if EncodingEstimator::ParallelSupport.supported? && !max_processes.nil?
|
32
32
|
opts = {
|
33
33
|
in_processes: max_processes,
|
34
34
|
progress: ( show_progress && EncodingEstimator::ParallelSupport.progress? ) ? 'Analyzing' : nil
|
@@ -58,6 +58,13 @@ module EncodingEstimator
|
|
58
58
|
@key ||= "#{operation}_#{encoding}"
|
59
59
|
end
|
60
60
|
|
61
|
+
# Get the default (utf-8) encoding conversion: does nothing when performing the conversion.
|
62
|
+
#
|
63
|
+
# @return [EncodingEstimator::Conversion] A conversion object representing the default conversion
|
64
|
+
def self.default
|
65
|
+
EncodingEstimator::Conversion.new
|
66
|
+
end
|
67
|
+
|
61
68
|
# Generate all conversions for the given encodings and operations. Note: this will produce
|
62
69
|
#encodings * #operations conversions if default is not included and #encodings * #operations + 1
|
63
70
|
# if the default is included.
|
@@ -19,10 +19,10 @@ module EncodingEstimator
|
|
19
19
|
#
|
20
20
|
# @return [Boolean] true, if the referenced model file exists
|
21
21
|
def valid?
|
22
|
-
if
|
23
|
-
@language.to_s.size == 2 and File.file? internal_path
|
24
|
-
else
|
22
|
+
if external?
|
25
23
|
File.file? external_path
|
24
|
+
else
|
25
|
+
@language.to_s.size == 2 and File.file? internal_path
|
26
26
|
end
|
27
27
|
end
|
28
28
|
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# Loading files of unknown encoding (interactive)
|
2
|
+
|
3
|
+
We continue using the code snippet from [the non-interactive loading](./noninteractive.md). This time, however, we don't use `EncodingEstimator.ensure_utf8` to convert the string automatically; instead, we let the user decide whether the detection was correct. This is where the `EncodingEstimator.detect` method is useful: it does not convert the input, but gives you a detection result. It looks like this:
|
4
|
+
|
5
|
+
```ruby
|
6
|
+
cfg = { languages: [:de], encodings: [ 'windows-1252' ] }
|
7
|
+
detection = EncodingEstimator.detect( input, cfg )
|
8
|
+
|
9
|
+
puts detection.result.encoding # e.g. 'utf-8' or 'windows-1252'
|
10
|
+
```
|
11
|
+
|
12
|
+
The `detect` method returns an `EncodingEstimator::Detection` instance. It contains information on how likely each encoding is. Using `EncodingEstimator::Detection#result` you get the most probable conversion, represented as an `EncodingEstimator::Conversion` object. So let's check the encoding we just detected:
|
13
|
+
|
14
|
+
```ruby
|
15
|
+
require 'csv'
|
16
|
+
require 'encoding_estimator'
|
17
|
+
|
18
|
+
cfg = { languages: [:de], encodings: [ 'windows-1252' ] }
|
19
|
+
content = File.read( ARGV[ 0 ], encoding: 'utf-8' )
|
20
|
+
detection = EncodingEstimator.detect( content, cfg )
|
21
|
+
|
22
|
+
# Not the default encoding?
|
23
|
+
unless detection.result.equals? EncodingEstimator::Conversion.default
|
24
|
+
puts "Detected encoding #{detection.result.encoding} on #{ARGV[0]}."
|
25
|
+
puts "Is this correct? (y/n)"
|
26
|
+
|
27
|
+
# If the user accepts, decode as the detected encoding
|
28
|
+
content = detection.result.perform content if STDIN.readline.strip == 'y'
|
29
|
+
end
|
30
|
+
|
31
|
+
CSV.parse( content ) do |row|
|
32
|
+
puts row[ 0 ]
|
33
|
+
end
|
34
|
+
```
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# Loading files of unknown encoding (non-interactive)
|
2
|
+
|
3
|
+
Let's say you have the following application, which just reads a CSV file and prints the first column of every row:
|
4
|
+
|
5
|
+
```ruby
|
6
|
+
require 'csv'
|
7
|
+
|
8
|
+
content = File.read( ARGV[ 0 ], encoding: 'utf-8' )
|
9
|
+
CSV.parse( content ) do |row|
|
10
|
+
puts row[ 0 ]
|
11
|
+
end
|
12
|
+
```
|
13
|
+
|
14
|
+
*Note: yes, you could use `CSV.read` but this is easier to follow for developers not familiar with the `CSV` class. And please, don't use that snippet in production as there is no error handling at all.*
|
15
|
+
|
16
|
+
|
17
|
+
So you want to ensure that the file you read is correctly encoded, because sometimes you may get these files in... let's say "interesting"... encodings, e.g. Windows-1252 in some Excel exports.
|
18
|
+
|
19
|
+
Assume that you know that your little tool gets files containing German text encoded either as Windows-1252 or UTF-8. To handle both encodings correctly, we change the tool:
|
20
|
+
|
21
|
+
```ruby
|
22
|
+
require 'csv'
|
23
|
+
require 'encoding_estimator'
|
24
|
+
|
25
|
+
cfg = { languages: [:de], encodings: [ 'windows-1252' ] }
|
26
|
+
content = EncodingEstimator.ensure_utf8( File.read( ARGV[ 0 ], encoding: 'utf-8' ), cfg )
|
27
|
+
|
28
|
+
CSV.parse( content ) do |row|
|
29
|
+
puts row[ 0 ]
|
30
|
+
end
|
31
|
+
```
|
32
|
+
Looking at the 4th line, you might think: wait, there's only Windows-1252 listed in the configuration. What about UTF-8? UTF-8 is included by default, you can disable it via `include_default: false` in the configuration.
|
33
|
+
|
34
|
+
The `EncodingEstimator.ensure_utf8` method detects which encoding matches the input string best and returns the string converted from that encoding to UTF-8.
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: encoding_estimator
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Oskar Kirmis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-01-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -38,6 +38,48 @@ dependencies:
|
|
38
38
|
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: parallel
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: simplecov
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: simplecov-parallel
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
41
83
|
- !ruby/object:Gem::Dependency
|
42
84
|
name: htmlentities
|
43
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -123,6 +165,9 @@ files:
|
|
123
165
|
- lib/encoding_estimator/language_model.rb
|
124
166
|
- lib/encoding_estimator/parallel_support.rb
|
125
167
|
- lib/encoding_estimator/version.rb
|
168
|
+
- tutorials/interactive.md
|
169
|
+
- tutorials/noninteractive.md
|
170
|
+
- tutorials/tutorial.md
|
126
171
|
homepage: https://github.com/okirmis/encoding_estimator
|
127
172
|
licenses:
|
128
173
|
- MIT
|