google_translate_diff 1.0.6 → 1.0.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0d3f09238c1e758c969c8e32d3c19997aee401c8
4
- data.tar.gz: 3a9d0a82e79b6f0b7195972d38468d29883adb34
3
+ metadata.gz: f06bc3ed14da7bb671f3afc188375f72da6b1026
4
+ data.tar.gz: 62af00a1a43df3f8c1773346709c3f09e95cf186
5
5
  SHA512:
6
- metadata.gz: 82046bb8370f241ed386c0f7c9e82438afd45f4e86a7aed6ebaf9090c090f7ff5c07fc5f90aa516856efcb1e1a9b0b29088971a0e279674fddf0a1349c3f4f89
7
- data.tar.gz: ac93211c285e3b492e593b89ad28a04dd5af7997a1ed6f5a3daf07e38e0aea2d5d12b1b8383e31f7897d1702cd6087375a349023e7a6b67bda3345a8d8439b54
6
+ metadata.gz: f2662f618fc76e10877f78ca07dbce8bfda0b0f2c08da06643ea2d91491354631d15d44137f5fb76a33770fc59becb442471092fd030281e28c17d05722e7f1c
7
+ data.tar.gz: 39ddf3a505c96560016e24a9f3bd303e8e36556894e7aea7c612ce6c9078266ef1e4b7756f807e256507bf40105bcc226bab80f65273798b004b4756f8b0dd31
@@ -44,7 +44,7 @@ between revisions of long texts.
44
44
  spec.add_development_dependency "simplecov"
45
45
 
46
46
  spec.add_dependency "google-cloud-translate"
47
- spec.add_dependency "ox" #, "< 2.5"
47
+ spec.add_dependency "ox"
48
48
  spec.add_dependency "dry-initializer"
49
49
  spec.add_dependency "punkt-segmenter"
50
50
  end
@@ -1,40 +1,58 @@
1
1
  class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
2
2
  def initialize(source)
3
3
  @pos = nil
4
- @prev = 1
5
- @skip = false
6
4
  @source = source
7
5
  @tokens = []
6
+ @context = []
7
+ @sequence = []
8
+ @indicies = []
8
9
  end
9
10
 
10
- attr_reader :texts, :tokens, :prev, :pos
11
-
12
11
  def start_element(name)
13
- @skip = true if SKIP.include?(name)
12
+ @context << name
13
+ @sequence << :markup
14
+ @indicies << @pos - 1
14
15
  end
15
16
 
16
17
  def end_element(name)
17
- @skip = false if SKIP.include?(name)
18
+ @context.pop
19
+ @sequence << (nontranslate?(name) ? :notranslate : :markup)
20
+ @indicies << @pos - 1 unless @pos == @source.bytesize
18
21
  end
19
22
 
20
- def text(value)
21
- return if @skip
22
- value = fix_utf(value)
23
- return if value.strip.empty?
24
-
25
- token.tap { |t| @tokens << [fix_utf(t), :markup] if t }
26
- @tokens.concat(sentences(value))
23
+ def attr(name, value)
24
+ unless @context.last == :span && name == :class && value == "notranslate"
25
+ return
26
+ end
27
+ @sequence[-1] = :notranslate
28
+ end
27
29
 
28
- @prev = @pos + value.bytesize
30
+ def text(_)
31
+ @sequence << (SKIP.include?(@context.last) ? :markup : :text)
32
+ @indicies << @pos - 1
29
33
  end
30
34
 
31
- def token
32
- return if @prev == @pos
33
- fix_utf(@source.byteslice((@prev - 1)..(@pos - 2)))
35
+ # rubocop:disable Metrics/AbcSize
36
+ def tokens
37
+ raw_tokens.each_with_object([]) do |token, tokens|
38
+ if tokens.empty?
39
+ tokens << token
40
+ elsif tokens.last[1] == token[1]
41
+ tokens.last[0].concat(token[0])
42
+ else
43
+ tokens.concat(sentences(tokens.pop[0])) if tokens.last[1] == :text
44
+ tokens << token
45
+ end
46
+ end
34
47
  end
48
+ # rubocop:enable Metrics/AbcSize
35
49
 
36
- # Splits text by sentences
50
+ private
51
+
52
+ # rubocop: disable Metrics/MethodLength
37
53
  def sentences(value)
54
+ return [] if value.empty?
55
+
38
56
  boundaries =
39
57
  Punkt::SentenceTokenizer
40
58
  .new(value)
@@ -49,10 +67,17 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
49
67
  [value[left..right], :text]
50
68
  end
51
69
  end
70
+ # rubocop:enable Metrics/MethodLength
52
71
 
53
- def cut_last_token
54
- last_token = fix_utf(@source.byteslice((@prev - 1)..-1))
55
- @tokens << [last_token, :markup] if last_token != ""
72
+ def raw_tokens
73
+ @indicies.map.with_index do |i, n|
74
+ first = i
75
+ last = (@indicies[n + 1] || 0) - 1
76
+ value = fix_utf(@source.byteslice(first..last))
77
+ type = @sequence[n]
78
+ type = :text if type == :notranslate
79
+ [value, type]
80
+ end
56
81
  end
57
82
 
58
83
  def fix_utf(value)
@@ -61,12 +86,15 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
61
86
  )
62
87
  end
63
88
 
89
+ def nontranslate?(name)
90
+ @sequence[-2] == :notranslate && name == :span
91
+ end
92
+
64
93
  class << self
65
94
  def tokenize(value)
66
95
  return [] if value.nil?
67
96
  tokenizer = new(value).tap do |h|
68
97
  Ox.sax_parse(h, StringIO.new(value), HTML_OPTIONS)
69
- h.cut_last_token
70
98
  end
71
99
  tokenizer.tokens
72
100
  end
@@ -1,3 +1,3 @@
1
1
  module GoogleTranslateDiff
2
- VERSION = "1.0.6".freeze
2
+ VERSION = "1.0.7".freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: google_translate_diff
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.6
4
+ version: 1.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Victor Sokolov
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-08-30 00:00:00.000000000 Z
11
+ date: 2017-08-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler