encoding_estimator 0.1.2 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c877eb14be7a918d83a1bb6b38afc11506a86a09
4
- data.tar.gz: 39ae7ccee3d25c57ebd8c3895a631862d1ca942b
3
+ metadata.gz: f753dae50ff0610f5a06eb7e13826de852c094a2
4
+ data.tar.gz: 2fd62584bc93ef4fbe2233cda5c862dd0c85a02c
5
5
  SHA512:
6
- metadata.gz: 7c7e89167879b742b9395f8d342c4e7c9dfdc8c32ef169d798ddb64eddd3c736013d491cb0feac187ee65c275bf33eafa03d30246f81315b605ab9f37bc153db
7
- data.tar.gz: 4dff18e88f4a2551bdcc10b6f3d786d1636a275413eacf7e96ab8af007661e1753e517358d1f5d863d83bec877aefe0943aa0401ea2e6ff957bd277b915d0740
6
+ metadata.gz: fc5f8224a9f4ab7b088328036d626ffe45828a3b2b024800f768f183674b49caa2bfb51a4bf8d6a7a7f6f601136c72f6b16a50ec612c130e09075e908067138f
7
+ data.tar.gz: c4e2d79b8e3e7f70b7fe705755e99f43b1ab27f4470456adf535ef0abcc8ab8960cd6183615c322747eb7f4641dc62ba52795cd9860db04867850f0e051dce0a
data/.gitignore CHANGED
@@ -1,2 +1,4 @@
1
1
  Gemfile.lock
2
- .idea
2
+ .idea
3
+ coverage
4
+ doc
@@ -22,8 +22,8 @@ test:2.3:
22
22
  - bundle install && ruby test/detector_test.rb
23
23
 
24
24
 
25
- test:2.4-rc:
26
- image: ruby:2.4-rc
25
+ test:2.4:
26
+ image: ruby:2.4
27
27
  script:
28
28
  - bundle install && ruby test/detector_test.rb
29
29
 
data/README.md CHANGED
@@ -1,6 +1,7 @@
1
1
  # EncodingEstimator: Detect encoding of strings
2
2
 
3
3
  ![Build Status](https://git.iftrue.de/okirmis/encoding_estimator/badges/master/build.svg)
4
+ ![Code Coverage](https://git.iftrue.de/okirmis/encoding_estimator/badges/master/coverage.svg)
4
5
 
5
6
  This gem allows you to detect the encoding of strings/files based on their content. This can be useful if you need to load data from sources with unknown encodings. The gem uses character distribution statistics to check which encoding is the one that gives you the best results.
6
7
 
@@ -19,6 +20,8 @@ The second one is a shortcut you can use in case you just want to be sure to get
19
20
  utf8_txt = EncodingEstimator.ensure_utf8( File.read( 'foo.txt' ), languages: [ :en, :de ] )
20
21
  ```
21
22
 
23
+ More detailed tutorials can be found [here](./tutorials/tutorial.md).
24
+
22
25
  If you need more control over the operations to perform, just have a look at `EncodingEstimator::Detector` and `EncodingEstimator::Conversion`.
23
26
 
24
27
 
@@ -51,7 +51,7 @@ opts[:threads] = opts[:threads] == 0 ? nil : opts[:threads]
51
51
 
52
52
  # Process every file
53
53
  opts.arguments.each do |file|
54
- detection = EncodingEstimator.detect File.read(file ), {
54
+ detection = EncodingEstimator.detect File.read( file, encoding: 'utf-8' ), {
55
55
  languages: opts[:languages], encodings: opts[:encodings],
56
56
  operations: opts[:operations], include_default: true,
57
57
  num_cores: opts[:threads]
@@ -101,5 +101,5 @@ configurations.each do |config|
101
101
  runner.execute!( opts[:threads], !silent )
102
102
 
103
103
  # Save the model as json
104
- File.open("#{config.language}.json", 'w') { |f| f.write JSON.unparse(runner.results) }
104
+ File.open("#{config.language}.json", 'w:utf-8') { |f| f.write JSON.unparse(runner.results) }
105
105
  end
@@ -26,6 +26,9 @@ Gem::Specification.new do |spec|
26
26
 
27
27
  spec.add_development_dependency 'bundler'
28
28
  spec.add_development_dependency 'minitest'
29
+ spec.add_development_dependency 'parallel'
30
+ spec.add_development_dependency 'simplecov'
31
+ spec.add_development_dependency 'simplecov-parallel'
29
32
 
30
33
  spec.add_dependency 'htmlentities', '~> 4.3'
31
34
  spec.add_dependency 'json', '~> 2.0'
@@ -28,7 +28,7 @@ module EncodingEstimator
28
28
  operations: [Conversion::Operation::DECODE],
29
29
  include_default: true,
30
30
  penalty: 0.01,
31
- num_cores: nil,
31
+ num_cores: nil
32
32
  }.merge config
33
33
 
34
34
  EncodingEstimator.detect( data, params ).result.perform( data )
@@ -48,14 +48,13 @@ module EncodingEstimator
48
48
  #
49
49
  # @return [EncodingEstimator::Detection] Detection result with scores for all conversions
50
50
  def EncodingEstimator.detect( data, config )
51
-
52
51
  params = {
53
52
  languages: [ :de, :en ],
54
53
  encodings: %w(iso-8859-1 utf-16le windows-1251),
55
54
  operations: [Conversion::Operation::DECODE],
56
55
  include_default: true,
57
56
  penalty: 0.01,
58
- num_cores: nil,
57
+ num_cores: nil
59
58
  }.merge config
60
59
 
61
60
  Detector.new(
@@ -59,7 +59,7 @@ module EncodingEstimator
59
59
  #
60
60
  # @return [String] Content of the file without whitespaces
61
61
  def load_content
62
- raw = File.read( @filename ).encode('UTF-16be', invalid: :replace, replace: '').encode('UTF-8')
62
+ raw = File.read( @filename, encoding: 'utf-8' ).encode('UTF-16be', invalid: :replace, replace: '').encode('UTF-8')
63
63
  decoder = HTMLEntities.new
64
64
  plaintext = decoder.decode raw
65
65
 
@@ -28,7 +28,7 @@ module EncodingEstimator
28
28
  # @param [Boolean] show_progress if set to true and the ruby-progressbar gem is installed, show a progressbar
29
29
  # @return [Hash] Character count statistics combined from all files of the directory, scaled linearly
30
30
  def execute!( max_processes = 4, show_progress = true )
31
- if EncodingEstimator::ParallelSupport.supported?
31
+ if EncodingEstimator::ParallelSupport.supported? && !max_processes.nil?
32
32
  opts = {
33
33
  in_processes: max_processes,
34
34
  progress: ( show_progress && EncodingEstimator::ParallelSupport.progress? ) ? 'Analyzing' : nil
@@ -58,6 +58,13 @@ module EncodingEstimator
58
58
  @key ||= "#{operation}_#{encoding}"
59
59
  end
60
60
 
61
+ # Get the default (utf-8) encoding conversion: does nothing when performing the conversion.
62
+ #
63
+ # @return [EncodingEstimator::Conversion] A conversion object representing the default conversion
64
+ def self.default
65
+ EncodingEstimator::Conversion.new
66
+ end
67
+
61
68
  # Generate all conversions for the given encodings and operations. Note: this will produce
62
69
  # #encodings * #operations conversions if default is not included and #encodings * #operations + 1
63
70
  # if the default is included.
@@ -37,7 +37,7 @@ module EncodingEstimator
37
37
 
38
38
  begin
39
39
  distribution = JSON.parse(
40
- File.read( language.path )
40
+ File.read( language.path, encoding: 'utf-8' )
41
41
  )
42
42
  rescue Exception
43
43
  distribution = {}
@@ -19,10 +19,10 @@ module EncodingEstimator
19
19
  #
20
20
  # @return [Boolean] true, if the referenced model file exists
21
21
  def valid?
22
- if internal?
23
- @language.to_s.size == 2 and File.file? internal_path
24
- else
22
+ if external?
25
23
  File.file? external_path
24
+ else
25
+ @language.to_s.size == 2 and File.file? internal_path
26
26
  end
27
27
  end
28
28
 
@@ -1,3 +1,3 @@
1
1
  module EncodingEstimator
2
- VERSION = '0.1.2'
2
+ VERSION = '0.2.0'
3
3
  end
@@ -0,0 +1,34 @@
1
+ # Loading files of unknown encoding (interactive)
2
+
3
+ We continue using the code snippet from [the non-interactive loading](./noninteractive.md). But this time, we don't use `EncodingEstimator.ensure_utf8` to convert the string automatically; instead, we want to let the user decide whether the detection was correct. For this, the `EncodingEstimator.detect` method is very useful. It does not convert the input, but gives you a detection result. It looks like this:
4
+
5
+ ```ruby
6
+ cfg = { languages: [:de], encodings: [ 'windows-1252' ] }
7
+ detection = EncodingEstimator.detect( input, cfg )
8
+
9
+ puts detection.result.encoding # e.g. 'utf-8' or 'windows-1252'
10
+ ```
11
+
12
+ The `detect` method returns an `EncodingEstimator::Detection` instance. It contains information on how likely each encoding is. Using `EncodingEstimator::Detection.result` you will get the most probable conversion represented as an `EncodingEstimator::Conversion` object. So let's check the encoding we just detected:
13
+
14
+ ```ruby
15
+ require 'csv'
16
+ require 'encoding_estimator'
17
+
18
+ cfg = { languages: [:de], encodings: [ 'windows-1252' ] }
19
+ content = File.read( ARGV[ 0 ], encoding: 'utf-8' )
20
+ detection = EncodingEstimator.detect( content, cfg )
21
+
22
+ # Not the default encoding?
23
+ unless detection.result.equals? EncodingEstimator::Conversion.default
24
+ puts "Detected encoding #{detection.result.encoding} on #{ARGV[0]}."
25
+ puts "Is this correct? (y/n)"
26
+
27
+ # If the user accepts, decode as the detected encoding
28
+ content = detection.result.perform content if STDIN.readline.strip == 'y'
29
+ end
30
+
31
+ CSV.parse( content ) do |row|
32
+ puts row[ 0 ]
33
+ end
34
+ ```
@@ -0,0 +1,34 @@
1
+ # Loading files of unknown encoding (non-interactive)
2
+
3
+ Let's say you have the following application which just reads a CSV file and prints the first column of every row:
4
+
5
+ ```ruby
6
+ require 'csv'
7
+
8
+ content = File.read( ARGV[ 0 ], encoding: 'utf-8' )
9
+ CSV.parse( content ) do |row|
10
+ puts row[ 0 ]
11
+ end
12
+ ```
13
+
14
+ *Note: yes, you could use `CSV.read` but this is easier to follow for developers not familiar with the `CSV` class. And please, don't use that snippet in production as there is no error handling at all.*
15
+
16
+
17
+ So you want to ensure that the file you read is correctly encoded, because sometimes you may get these files in... let's say "interesting"... encodings, e.g. Windows-1252 in some Excel exports.
18
+
19
+ Assume that you know that your little tool gets files containing German text encoded either as Windows-1252 or UTF-8. To handle both encodings correctly, we change the tool:
20
+
21
+ ```ruby
22
+ require 'csv'
23
+ require 'encoding_estimator'
24
+
25
+ cfg = { languages: [:de], encodings: [ 'windows-1252' ] }
26
+ content = EncodingEstimator.ensure_utf8( File.read( ARGV[ 0 ], encoding: 'utf-8' ), cfg )
27
+
28
+ CSV.parse( content ) do |row|
29
+ puts row[ 0 ]
30
+ end
31
+ ```
32
+ Looking at the 4th line, you might think: wait, there's only Windows-1252 listed in the configuration. What about UTF-8? UTF-8 is included by default, you can disable it via `include_default: false` in the configuration.
33
+
34
+ The `EncodingEstimator.ensure_utf8` method interprets the input string in whatever encoding the gem detects as the best match and gives you the result as UTF-8.
@@ -0,0 +1,6 @@
1
+ # Tutorials
2
+
3
+ Learn how to use this gem in real world scenarios.
4
+
5
+ * [Loading CSV files without user interaction](./noninteractive.md)
6
+ * [Loading CSV files with user interaction](./interactive.md)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: encoding_estimator
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Oskar Kirmis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-12-28 00:00:00.000000000 Z
11
+ date: 2017-01-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -38,6 +38,48 @@ dependencies:
38
38
  - - '>='
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: parallel
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: simplecov
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: simplecov-parallel
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
41
83
  - !ruby/object:Gem::Dependency
42
84
  name: htmlentities
43
85
  requirement: !ruby/object:Gem::Requirement
@@ -123,6 +165,9 @@ files:
123
165
  - lib/encoding_estimator/language_model.rb
124
166
  - lib/encoding_estimator/parallel_support.rb
125
167
  - lib/encoding_estimator/version.rb
168
+ - tutorials/interactive.md
169
+ - tutorials/noninteractive.md
170
+ - tutorials/tutorial.md
126
171
  homepage: https://github.com/okirmis/encoding_estimator
127
172
  licenses:
128
173
  - MIT