google_translate_diff 1.0.6 → 1.0.7
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/google_translate_diff.gemspec +1 -1
- data/lib/google_translate_diff/tokenizer.rb +50 -22
- data/lib/google_translate_diff/version.rb +1 -1
- metadata +2 -2
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA1:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: f06bc3ed14da7bb671f3afc188375f72da6b1026
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: 62af00a1a43df3f8c1773346709c3f09e95cf186
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: f2662f618fc76e10877f78ca07dbce8bfda0b0f2c08da06643ea2d91491354631d15d44137f5fb76a33770fc59becb442471092fd030281e28c17d05722e7f1c
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: 39ddf3a505c96560016e24a9f3bd303e8e36556894e7aea7c612ce6c9078266ef1e4b7756f807e256507bf40105bcc226bab80f65273798b004b4756f8b0dd31
         
     | 
| 
         @@ -44,7 +44,7 @@ between revisions of long texts. 
     | 
|
| 
       44 
44 
     | 
    
         
             
              spec.add_development_dependency "simplecov"
         
     | 
| 
       45 
45 
     | 
    
         | 
| 
       46 
46 
     | 
    
         
             
              spec.add_dependency "google-cloud-translate"
         
     | 
| 
       47 
     | 
    
         
            -
              spec.add_dependency "ox" 
     | 
| 
      
 47 
     | 
    
         
            +
              spec.add_dependency "ox"
         
     | 
| 
       48 
48 
     | 
    
         
             
              spec.add_dependency "dry-initializer"
         
     | 
| 
       49 
49 
     | 
    
         
             
              spec.add_dependency "punkt-segmenter"
         
     | 
| 
       50 
50 
     | 
    
         
             
            end
         
     | 
| 
         @@ -1,40 +1,58 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
         
     | 
| 
       2 
2 
     | 
    
         
             
              def initialize(source)
         
     | 
| 
       3 
3 
     | 
    
         
             
                @pos = nil
         
     | 
| 
       4 
     | 
    
         
            -
                @prev = 1
         
     | 
| 
       5 
     | 
    
         
            -
                @skip = false
         
     | 
| 
       6 
4 
     | 
    
         
             
                @source = source
         
     | 
| 
       7 
5 
     | 
    
         
             
                @tokens = []
         
     | 
| 
      
 6 
     | 
    
         
            +
                @context = []
         
     | 
| 
      
 7 
     | 
    
         
            +
                @sequence = []
         
     | 
| 
      
 8 
     | 
    
         
            +
                @indicies = []
         
     | 
| 
       8 
9 
     | 
    
         
             
              end
         
     | 
| 
       9 
10 
     | 
    
         | 
| 
       10 
     | 
    
         
            -
              attr_reader :texts, :tokens, :prev, :pos
         
     | 
| 
       11 
     | 
    
         
            -
             
     | 
| 
       12 
11 
     | 
    
         
             
              def start_element(name)
         
     | 
| 
       13 
     | 
    
         
            -
                @ 
     | 
| 
      
 12 
     | 
    
         
            +
                @context << name
         
     | 
| 
      
 13 
     | 
    
         
            +
                @sequence << :markup
         
     | 
| 
      
 14 
     | 
    
         
            +
                @indicies << @pos - 1
         
     | 
| 
       14 
15 
     | 
    
         
             
              end
         
     | 
| 
       15 
16 
     | 
    
         | 
| 
       16 
17 
     | 
    
         
             
              def end_element(name)
         
     | 
| 
       17 
     | 
    
         
            -
                @ 
     | 
| 
      
 18 
     | 
    
         
            +
                @context.pop
         
     | 
| 
      
 19 
     | 
    
         
            +
                @sequence << (nontranslate?(name) ? :notranslate : :markup)
         
     | 
| 
      
 20 
     | 
    
         
            +
                @indicies << @pos - 1 unless @pos == @source.bytesize
         
     | 
| 
       18 
21 
     | 
    
         
             
              end
         
     | 
| 
       19 
22 
     | 
    
         | 
| 
       20 
     | 
    
         
            -
              def  
     | 
| 
       21 
     | 
    
         
            -
                 
     | 
| 
       22 
     | 
    
         
            -
             
     | 
| 
       23 
     | 
    
         
            -
                 
     | 
| 
       24 
     | 
    
         
            -
             
     | 
| 
       25 
     | 
    
         
            -
             
     | 
| 
       26 
     | 
    
         
            -
                @tokens.concat(sentences(value))
         
     | 
| 
      
 23 
     | 
    
         
            +
              def attr(name, value)
         
     | 
| 
      
 24 
     | 
    
         
            +
                unless @context.last == :span && name == :class && value == "notranslate"
         
     | 
| 
      
 25 
     | 
    
         
            +
                  return
         
     | 
| 
      
 26 
     | 
    
         
            +
                end
         
     | 
| 
      
 27 
     | 
    
         
            +
                @sequence[-1] = :notranslate
         
     | 
| 
      
 28 
     | 
    
         
            +
              end
         
     | 
| 
       27 
29 
     | 
    
         | 
| 
       28 
     | 
    
         
            -
             
     | 
| 
      
 30 
     | 
    
         
            +
              def text(_)
         
     | 
| 
      
 31 
     | 
    
         
            +
                @sequence << (SKIP.include?(@context.last) ? :markup : :text)
         
     | 
| 
      
 32 
     | 
    
         
            +
                @indicies << @pos - 1
         
     | 
| 
       29 
33 
     | 
    
         
             
              end
         
     | 
| 
       30 
34 
     | 
    
         | 
| 
       31 
     | 
    
         
            -
               
     | 
| 
       32 
     | 
    
         
            -
             
     | 
| 
       33 
     | 
    
         
            -
                 
     | 
| 
      
 35 
     | 
    
         
            +
              # rubocop:disable Metrics/AbcSize
         
     | 
| 
      
 36 
     | 
    
         
            +
              def tokens
         
     | 
| 
      
 37 
     | 
    
         
            +
                raw_tokens.each_with_object([]) do |token, tokens|
         
     | 
| 
      
 38 
     | 
    
         
            +
                  if tokens.empty?
         
     | 
| 
      
 39 
     | 
    
         
            +
                    tokens << token
         
     | 
| 
      
 40 
     | 
    
         
            +
                  elsif tokens.last[1] == token[1]
         
     | 
| 
      
 41 
     | 
    
         
            +
                    tokens.last[0].concat(token[0])
         
     | 
| 
      
 42 
     | 
    
         
            +
                  else
         
     | 
| 
      
 43 
     | 
    
         
            +
                    tokens.concat(sentences(tokens.pop[0])) if tokens.last[1] == :text
         
     | 
| 
      
 44 
     | 
    
         
            +
                    tokens << token
         
     | 
| 
      
 45 
     | 
    
         
            +
                  end
         
     | 
| 
      
 46 
     | 
    
         
            +
                end
         
     | 
| 
       34 
47 
     | 
    
         
             
              end
         
     | 
| 
      
 48 
     | 
    
         
            +
              # rubocop:enable Metrics/AbcSize
         
     | 
| 
       35 
49 
     | 
    
         | 
| 
       36 
     | 
    
         
            -
               
     | 
| 
      
 50 
     | 
    
         
            +
              private
         
     | 
| 
      
 51 
     | 
    
         
            +
             
     | 
| 
      
 52 
     | 
    
         
            +
              # rubocop: disable Metrics/MethodLength
         
     | 
| 
       37 
53 
     | 
    
         
             
              def sentences(value)
         
     | 
| 
      
 54 
     | 
    
         
            +
                return [] if value.empty?
         
     | 
| 
      
 55 
     | 
    
         
            +
             
     | 
| 
       38 
56 
     | 
    
         
             
                boundaries =
         
     | 
| 
       39 
57 
     | 
    
         
             
                  Punkt::SentenceTokenizer
         
     | 
| 
       40 
58 
     | 
    
         
             
                  .new(value)
         
     | 
| 
         @@ -49,10 +67,17 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax 
     | 
|
| 
       49 
67 
     | 
    
         
             
                  [value[left..right], :text]
         
     | 
| 
       50 
68 
     | 
    
         
             
                end
         
     | 
| 
       51 
69 
     | 
    
         
             
              end
         
     | 
| 
      
 70 
     | 
    
         
            +
              # rubocop:enable Metrics/MethodLength
         
     | 
| 
       52 
71 
     | 
    
         | 
| 
       53 
     | 
    
         
            -
              def  
     | 
| 
       54 
     | 
    
         
            -
                 
     | 
| 
       55 
     | 
    
         
            -
             
     | 
| 
      
 72 
     | 
    
         
            +
              def raw_tokens
         
     | 
| 
      
 73 
     | 
    
         
            +
                @indicies.map.with_index do |i, n|
         
     | 
| 
      
 74 
     | 
    
         
            +
                  first = i
         
     | 
| 
      
 75 
     | 
    
         
            +
                  last = (@indicies[n + 1] || 0) - 1
         
     | 
| 
      
 76 
     | 
    
         
            +
                  value = fix_utf(@source.byteslice(first..last))
         
     | 
| 
      
 77 
     | 
    
         
            +
                  type = @sequence[n]
         
     | 
| 
      
 78 
     | 
    
         
            +
                  type = :text if type == :notranslate
         
     | 
| 
      
 79 
     | 
    
         
            +
                  [value, type]
         
     | 
| 
      
 80 
     | 
    
         
            +
                end
         
     | 
| 
       56 
81 
     | 
    
         
             
              end
         
     | 
| 
       57 
82 
     | 
    
         | 
| 
       58 
83 
     | 
    
         
             
              def fix_utf(value)
         
     | 
| 
         @@ -61,12 +86,15 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax 
     | 
|
| 
       61 
86 
     | 
    
         
             
                )
         
     | 
| 
       62 
87 
     | 
    
         
             
              end
         
     | 
| 
       63 
88 
     | 
    
         | 
| 
      
 89 
     | 
    
         
            +
              def nontranslate?(name)
         
     | 
| 
      
 90 
     | 
    
         
            +
                @sequence[-2] == :notranslate && name == :span
         
     | 
| 
      
 91 
     | 
    
         
            +
              end
         
     | 
| 
      
 92 
     | 
    
         
            +
             
     | 
| 
       64 
93 
     | 
    
         
             
              class << self
         
     | 
| 
       65 
94 
     | 
    
         
             
                def tokenize(value)
         
     | 
| 
       66 
95 
     | 
    
         
             
                  return [] if value.nil?
         
     | 
| 
       67 
96 
     | 
    
         
             
                  tokenizer = new(value).tap do |h|
         
     | 
| 
       68 
97 
     | 
    
         
             
                    Ox.sax_parse(h, StringIO.new(value), HTML_OPTIONS)
         
     | 
| 
       69 
     | 
    
         
            -
                    h.cut_last_token
         
     | 
| 
       70 
98 
     | 
    
         
             
                  end
         
     | 
| 
       71 
99 
     | 
    
         
             
                  tokenizer.tokens
         
     | 
| 
       72 
100 
     | 
    
         
             
                end
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,14 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: google_translate_diff
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 1.0. 
     | 
| 
      
 4 
     | 
    
         
            +
              version: 1.0.7
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - Victor Sokolov
         
     | 
| 
       8 
8 
     | 
    
         
             
            autorequire: 
         
     | 
| 
       9 
9 
     | 
    
         
             
            bindir: exe
         
     | 
| 
       10 
10 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       11 
     | 
    
         
            -
            date: 2017-08- 
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2017-08-31 00:00:00.000000000 Z
         
     | 
| 
       12 
12 
     | 
    
         
             
            dependencies:
         
     | 
| 
       13 
13 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       14 
14 
     | 
    
         
             
              name: bundler
         
     |