RubyGems - encoding_estimator - Versions diffs - 0.1.2 → 0.2.0 - Mend

encoding_estimator 0.1.2 → 0.2.0

Files changed (18) hide show

checksums.yaml +4 -4
data/.gitignore +3 -1
data/.gitlab-ci.yml +2 -2
data/README.md +3 -0
data/bin/encest-detect +1 -1
data/bin/encest-gen +1 -1
data/encoding_estimator.gemspec +3 -0
data/lib/encoding_estimator.rb +2 -3
data/lib/encoding_estimator/builder/model_builder.rb +1 -1
data/lib/encoding_estimator/builder/parallel_model_builder.rb +1 -1
data/lib/encoding_estimator/conversion.rb +7 -0
data/lib/encoding_estimator/distribution.rb +1 -1
data/lib/encoding_estimator/language_model.rb +3 -3
data/lib/encoding_estimator/version.rb +1 -1
data/tutorials/interactive.md +34 -0
data/tutorials/noninteractive.md +34 -0
data/tutorials/tutorial.md +6 -0
metadata +47 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: c877eb14be7a918d83a1bb6b38afc11506a86a09
-  data.tar.gz: 39ae7ccee3d25c57ebd8c3895a631862d1ca942b
+  metadata.gz: f753dae50ff0610f5a06eb7e13826de852c094a2
+  data.tar.gz: 2fd62584bc93ef4fbe2233cda5c862dd0c85a02c
 SHA512:
-  metadata.gz: 7c7e89167879b742b9395f8d342c4e7c9dfdc8c32ef169d798ddb64eddd3c736013d491cb0feac187ee65c275bf33eafa03d30246f81315b605ab9f37bc153db
-  data.tar.gz: 4dff18e88f4a2551bdcc10b6f3d786d1636a275413eacf7e96ab8af007661e1753e517358d1f5d863d83bec877aefe0943aa0401ea2e6ff957bd277b915d0740
+  metadata.gz: fc5f8224a9f4ab7b088328036d626ffe45828a3b2b024800f768f183674b49caa2bfb51a4bf8d6a7a7f6f601136c72f6b16a50ec612c130e09075e908067138f
+  data.tar.gz: c4e2d79b8e3e7f70b7fe705755e99f43b1ab27f4470456adf535ef0abcc8ab8960cd6183615c322747eb7f4641dc62ba52795cd9860db04867850f0e051dce0a

data/.gitignore CHANGED

@@ -1,2 +1,4 @@
 Gemfile.lock
-.idea
+.idea
+coverage
+doc

data/.gitlab-ci.yml CHANGED

@@ -22,8 +22,8 @@ test:2.3:
     - bundle install && ruby test/detector_test.rb
-test:2.4-rc:
-  image: ruby:2.4-rc
+test:2.4:
+  image: ruby:2.4
   script:
     - bundle install && ruby test/detector_test.rb

data/README.md CHANGED

@@ -1,6 +1,7 @@
 # EncodingEstimator: Detect encoding of strings
 ![Build Status](https://git.iftrue.de/okirmis/encoding_estimator/badges/master/build.svg)
+![Code Coverage](https://git.iftrue.de/okirmis/encoding_estimator/badges/master/coverage.svg)
 This gem allows you to detect the encoding of strings/files based on their content. This can be useful if you need to load data from sources with unknown encodings. The gem uses character distribution statistics to check which encoding is the one that gives you the best results.
@@ -19,6 +20,8 @@ The second one is a shortcut you can use in case you just want to be sure to get
 utf8_txt = EncodingEstimator.ensure_utf8( File.read( 'foo.txt' ), languages: [ :en, :de ] )
 ```
+More detailed tutorials can be found [here](./tutorials/tutorial.md).
 If you need more control over the operations to perform, just have a look at `EncodingEstimator::Detector` and `EncodingEstimator::Conversion`.

data/bin/encest-detect CHANGED

@@ -51,7 +51,7 @@ opts[:threads] = opts[:threads] == 0 ? nil : opts[:threads]
 # Process every file
 opts.arguments.each do |file|
-  detection = EncodingEstimator.detect File.read(file ), {
+  detection = EncodingEstimator.detect File.read( file, encoding: 'utf-8' ), {
                     languages:  opts[:languages],  encodings: opts[:encodings],
                     operations: opts[:operations], include_default: true,
                     num_cores:  opts[:threads]

data/bin/encest-gen CHANGED

@@ -101,5 +101,5 @@ configurations.each do |config|
   runner.execute!( opts[:threads], !silent )
   # Save the model as json
-  File.open("#{config.language}.json", 'w') { |f| f.write JSON.unparse(runner.results) }
+  File.open("#{config.language}.json", 'w:utf-8') { |f| f.write JSON.unparse(runner.results) }
 end

data/encoding_estimator.gemspec CHANGED

@@ -26,6 +26,9 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency 'bundler'
   spec.add_development_dependency 'minitest'
+  spec.add_development_dependency 'parallel'
+  spec.add_development_dependency 'simplecov'
+  spec.add_development_dependency 'simplecov-parallel'
   spec.add_dependency 'htmlentities', '~> 4.3'
   spec.add_dependency 'json', '~> 2.0'

data/lib/encoding_estimator.rb CHANGED

@@ -28,7 +28,7 @@ module EncodingEstimator
       operations:       [Conversion::Operation::DECODE],
       include_default:  true,
       penalty:          0.01,
-      num_cores:        nil,
+      num_cores:        nil
     }.merge config
     EncodingEstimator.detect( data, params ).result.perform( data )
@@ -48,14 +48,13 @@ module EncodingEstimator
   #
   # @return [EncodingEstimator::Detection] Detection result with scores for all conversions
   def EncodingEstimator.detect( data, config )
     params = {
         languages:       [ :de, :en ],
         encodings:       %w(iso-8859-1 utf-16le windows-1251),
         operations:      [Conversion::Operation::DECODE],
         include_default: true,
         penalty:         0.01,
-        num_cores:       nil,
+        num_cores:       nil
     }.merge config
     Detector.new(

data/lib/encoding_estimator/builder/model_builder.rb CHANGED

@@ -59,7 +59,7 @@ module EncodingEstimator
     #
     # @return [String] Content of the file without whitespaces
     def load_content
-      raw       = File.read( @filename ).encode('UTF-16be', invalid: :replace, replace: '').encode('UTF-8')
+      raw       = File.read( @filename, encoding: 'utf-8' ).encode('UTF-16be', invalid: :replace, replace: '').encode('UTF-8')
       decoder   = HTMLEntities.new
       plaintext = decoder.decode raw

data/lib/encoding_estimator/builder/parallel_model_builder.rb CHANGED

@@ -28,7 +28,7 @@ module EncodingEstimator
     # @param [Boolean] show_progress if set to true and the ruby-progressbar gem is installed, show a progressbar
     # @return [Hash] Character count statistics combined from all files of the directory, scaled linear
     def execute!( max_processes = 4, show_progress = true )
-      if EncodingEstimator::ParallelSupport.supported?
+      if EncodingEstimator::ParallelSupport.supported? && !max_processes.nil?
         opts = {
             in_processes: max_processes,
             progress: ( show_progress && EncodingEstimator::ParallelSupport.progress? ) ? 'Analyzing' : nil

data/lib/encoding_estimator/conversion.rb CHANGED

@@ -58,6 +58,13 @@ module EncodingEstimator
       @key ||= "#{operation}_#{encoding}"
     end
+    # Get the default (utf-8) encoding conversion: does nothing when performing the conversion.
+    #
+    # @return [EncodingEstimator::Conversion] A conversion object representing the default conversion
+    def self.default
+      EncodingEstimator::Conversion.new
+    end
     # Generate all conversions for the given encodings and operations. Note: this will produce
     # #encodings * #operations conversions if default is not included and #encodings * #operations + 1
     # if the default is included.

data/lib/encoding_estimator/distribution.rb CHANGED

@@ -37,7 +37,7 @@ module EncodingEstimator
       begin
         distribution = JSON.parse(
-            File.read( language.path )
+            File.read( language.path, encoding: 'utf-8' )
         )
       rescue Exception
         distribution = {}

data/lib/encoding_estimator/language_model.rb CHANGED

@@ -19,10 +19,10 @@ module EncodingEstimator
     #
     # @return [Boolean] true, if the referenced model file exists
     def valid?
-      if internal?
-        @language.to_s.size == 2 and File.file? internal_path
-      else
+      if external?
         File.file? external_path
+      else
+        @language.to_s.size == 2 and File.file? internal_path
       end
     end

data/lib/encoding_estimator/version.rb CHANGED

@@ -1,3 +1,3 @@
 module EncodingEstimator
-  VERSION = '0.1.2'
+  VERSION = '0.2.0'
 end

data/tutorials/interactive.md ADDED

@@ -0,0 +1,34 @@
+# Loading files of unknown encoding (interactive)
+We continue using the code snippet from [the non-interactive loading](./noninteractive.md). This time, however, we don't use `EncodingEstimator.ensure_utf8` to convert the string automatically; instead, we want to let the user decide whether the detection was correct. This is where the `EncodingEstimator.detect` method is useful: it does not convert the input, but gives you a detection result. It looks like this:
+```ruby
+cfg       = { languages: [:de], encodings: [ 'windows-1252' ] }
+detection = EncodingEstimator.detect( input, cfg )
+puts detection.result.encoding # e.g. 'utf-8' or 'windows-1252'
+```
+The `detect` method returns an `EncodingEstimator::Detection` instance. It contains information on how likely each encoding is. Using `EncodingEstimator::Detection.result` you will get the most probable conversion represented as an `EncodingEstimator::Conversion` object. So let's check the encoding we just detected:
+```ruby
+require 'csv'
+require 'encoding_estimator'
+cfg       = { languages: [:de], encodings: [ 'windows-1252' ] }
+content   = File.read( ARGV[ 0 ], encoding: 'utf-8' )
+detection = EncodingEstimator.detect( content, cfg )
+# Not the default encoding?
+unless detection.result.equals? EncodingEstimator::Conversion.default
+  puts "Detected encoding #{detection.result.encoding} on #{ARGV[0]}."
+  puts "Is this correct? (y/n)"
+  # If the user accepts, decode as the detected encoding
+  content = detection.result.perform content if STDIN.readline.strip == 'y'
+end
+CSV.parse( content ) do |row|
+  puts row[ 0 ]
+end
+```

data/tutorials/noninteractive.md ADDED

@@ -0,0 +1,34 @@
+# Loading files of unknown encoding (non-interactive)
+Let's say you have the following application which just reads a CSV file and prints the first field of every row:
+```ruby
+require 'csv'
+content = File.read( ARGV[ 0 ], encoding: 'utf-8' )
+CSV.parse( content ) do |row|
+  puts row[ 0 ]
+end
+```
+*Note: yes, you could use `CSV.read` but this is easier to follow for developers not familiar with the `CSV` class. And please, don't use that snippet in production as there is no error handling at all.*
+So you want to ensure that the file you read is correctly encoded, because sometimes you may get these files in... let's say "interesting"... encodings, e.g. Windows-1252 in some Excel exports.
+Assume that you know that your little tool gets files containing German text encoded either as Windows-1252 or UTF-8. To handle both encodings correctly, we change the tool:
+```ruby
+require 'csv'
+require 'encoding_estimator'
+cfg     = { languages: [:de], encodings: [ 'windows-1252' ] }
+content = EncodingEstimator.ensure_utf8( File.read( ARGV[ 0 ], encoding: 'utf-8' ), cfg )
+CSV.parse( content ) do |row|
+  puts row[ 0 ]
+end
+```
+Looking at the 4th line, you might think: wait, there's only Windows-1252 listed in the configuration. What about UTF-8? UTF-8 is included by default; you can disable it via `include_default: false` in the configuration.
+The `EncodingEstimator.ensure_utf8` method gives you the input string converted to UTF-8, interpreting the input as whatever encoding the gem detects as the best match.

data/tutorials/tutorial.md ADDED

@@ -0,0 +1,6 @@
+# Tutorials
+Learn how to use this gem in real world scenarios.
+* [Loading CSV files without user interaction](./noninteractive.md)
+* [Loading CSV files with user interaction](./interactive.md)

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: encoding_estimator
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.2.0
 platform: ruby
 authors:
 - Oskar Kirmis
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-12-28 00:00:00.000000000 Z
+date: 2017-01-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -38,6 +38,48 @@ dependencies:
     - - '>='
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: parallel
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: simplecov
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: simplecov-parallel
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: htmlentities
   requirement: !ruby/object:Gem::Requirement
@@ -123,6 +165,9 @@ files:
 - lib/encoding_estimator/language_model.rb
 - lib/encoding_estimator/parallel_support.rb
 - lib/encoding_estimator/version.rb
+- tutorials/interactive.md
+- tutorials/noninteractive.md
+- tutorials/tutorial.md
 homepage: https://github.com/okirmis/encoding_estimator
 licenses:
 - MIT