google_translate_diff 1.0.6 → 1.0.7

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0d3f09238c1e758c969c8e32d3c19997aee401c8
4
- data.tar.gz: 3a9d0a82e79b6f0b7195972d38468d29883adb34
3
+ metadata.gz: f06bc3ed14da7bb671f3afc188375f72da6b1026
4
+ data.tar.gz: 62af00a1a43df3f8c1773346709c3f09e95cf186
5
5
  SHA512:
6
- metadata.gz: 82046bb8370f241ed386c0f7c9e82438afd45f4e86a7aed6ebaf9090c090f7ff5c07fc5f90aa516856efcb1e1a9b0b29088971a0e279674fddf0a1349c3f4f89
7
- data.tar.gz: ac93211c285e3b492e593b89ad28a04dd5af7997a1ed6f5a3daf07e38e0aea2d5d12b1b8383e31f7897d1702cd6087375a349023e7a6b67bda3345a8d8439b54
6
+ metadata.gz: f2662f618fc76e10877f78ca07dbce8bfda0b0f2c08da06643ea2d91491354631d15d44137f5fb76a33770fc59becb442471092fd030281e28c17d05722e7f1c
7
+ data.tar.gz: 39ddf3a505c96560016e24a9f3bd303e8e36556894e7aea7c612ce6c9078266ef1e4b7756f807e256507bf40105bcc226bab80f65273798b004b4756f8b0dd31
@@ -44,7 +44,7 @@ between revisions of long texts.
44
44
  spec.add_development_dependency "simplecov"
45
45
 
46
46
  spec.add_dependency "google-cloud-translate"
47
- spec.add_dependency "ox" #, "< 2.5"
47
+ spec.add_dependency "ox"
48
48
  spec.add_dependency "dry-initializer"
49
49
  spec.add_dependency "punkt-segmenter"
50
50
  end
@@ -1,40 +1,58 @@
1
1
  class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
2
2
  def initialize(source)
3
3
  @pos = nil
4
- @prev = 1
5
- @skip = false
6
4
  @source = source
7
5
  @tokens = []
6
+ @context = []
7
+ @sequence = []
8
+ @indicies = []
8
9
  end
9
10
 
10
- attr_reader :texts, :tokens, :prev, :pos
11
-
12
11
  def start_element(name)
13
- @skip = true if SKIP.include?(name)
12
+ @context << name
13
+ @sequence << :markup
14
+ @indicies << @pos - 1
14
15
  end
15
16
 
16
17
  def end_element(name)
17
- @skip = false if SKIP.include?(name)
18
+ @context.pop
19
+ @sequence << (nontranslate?(name) ? :notranslate : :markup)
20
+ @indicies << @pos - 1 unless @pos == @source.bytesize
18
21
  end
19
22
 
20
- def text(value)
21
- return if @skip
22
- value = fix_utf(value)
23
- return if value.strip.empty?
24
-
25
- token.tap { |t| @tokens << [fix_utf(t), :markup] if t }
26
- @tokens.concat(sentences(value))
23
+ def attr(name, value)
24
+ unless @context.last == :span && name == :class && value == "notranslate"
25
+ return
26
+ end
27
+ @sequence[-1] = :notranslate
28
+ end
27
29
 
28
- @prev = @pos + value.bytesize
30
+ def text(_)
31
+ @sequence << (SKIP.include?(@context.last) ? :markup : :text)
32
+ @indicies << @pos - 1
29
33
  end
30
34
 
31
- def token
32
- return if @prev == @pos
33
- fix_utf(@source.byteslice((@prev - 1)..(@pos - 2)))
35
+ # rubocop:disable Metrics/AbcSize
36
+ def tokens
37
+ raw_tokens.each_with_object([]) do |token, tokens|
38
+ if tokens.empty?
39
+ tokens << token
40
+ elsif tokens.last[1] == token[1]
41
+ tokens.last[0].concat(token[0])
42
+ else
43
+ tokens.concat(sentences(tokens.pop[0])) if tokens.last[1] == :text
44
+ tokens << token
45
+ end
46
+ end
34
47
  end
48
+ # rubocop:enable Metrics/AbcSize
35
49
 
36
- # Splits text by sentences
50
+ private
51
+
52
+ # rubocop: disable Metrics/MethodLength
37
53
  def sentences(value)
54
+ return [] if value.empty?
55
+
38
56
  boundaries =
39
57
  Punkt::SentenceTokenizer
40
58
  .new(value)
@@ -49,10 +67,17 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
49
67
  [value[left..right], :text]
50
68
  end
51
69
  end
70
+ # rubocop:enable Metrics/MethodLength
52
71
 
53
- def cut_last_token
54
- last_token = fix_utf(@source.byteslice((@prev - 1)..-1))
55
- @tokens << [last_token, :markup] if last_token != ""
72
+ def raw_tokens
73
+ @indicies.map.with_index do |i, n|
74
+ first = i
75
+ last = (@indicies[n + 1] || 0) - 1
76
+ value = fix_utf(@source.byteslice(first..last))
77
+ type = @sequence[n]
78
+ type = :text if type == :notranslate
79
+ [value, type]
80
+ end
56
81
  end
57
82
 
58
83
  def fix_utf(value)
@@ -61,12 +86,15 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
61
86
  )
62
87
  end
63
88
 
89
+ def nontranslate?(name)
90
+ @sequence[-2] == :notranslate && name == :span
91
+ end
92
+
64
93
  class << self
65
94
  def tokenize(value)
66
95
  return [] if value.nil?
67
96
  tokenizer = new(value).tap do |h|
68
97
  Ox.sax_parse(h, StringIO.new(value), HTML_OPTIONS)
69
- h.cut_last_token
70
98
  end
71
99
  tokenizer.tokens
72
100
  end
@@ -1,3 +1,3 @@
1
1
  module GoogleTranslateDiff
2
- VERSION = "1.0.6".freeze
2
+ VERSION = "1.0.7".freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: google_translate_diff
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.6
4
+ version: 1.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Victor Sokolov
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-08-30 00:00:00.000000000 Z
11
+ date: 2017-08-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler