google_translate_diff 1.0.10 → 1.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cbd316d964f6fdb9f119c66b2087e475bbed2b04
4
- data.tar.gz: 02ae88623b5b4ddddebbf9b23a9bb02075516d6a
3
+ metadata.gz: 07c59774517f93999222ed67cd66eccb5ffad592
4
+ data.tar.gz: f75bd48f486fca6f1db63c1c188766b5112bc71c
5
5
  SHA512:
6
- metadata.gz: 84d6f5f4287a586455900838a534760a1eec4755551ee920325336d6d513a0b59b0727acfadbcb84a82cc7a17b257bcce4c4ae94d4c8f1199948ce3a2b7be076
7
- data.tar.gz: da22b6c0d1ddafeff3243f0e6b4aec4f4394451f22b307a8582c4a3581330f565f029d0db30907f1ef58f3c470a7a378e255527e2735998afae3af4f36039a56
6
+ metadata.gz: 7c7f741db39feb6880cb927a17ebe96b65f6e9d43a2bd0680e9aadece192c8786e086ef1e34f49581f981e86f504242a98c1b6cd3636b02cca3c94af564d176d
7
+ data.tar.gz: 96e64720cba873ba0a7513c7a390507ef78846281ae3b2dac4545cc65f8af437405f144260204e80654a9ed59bfe373622f858f2f3dc4a732c16728ac870c092
@@ -25,9 +25,10 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
25
25
  end
26
26
 
27
27
  def attr(name, value)
28
- unless @context.last == :span && name == :class && value == "notranslate"
29
- return
30
- end
28
+ return unless @context.last == :span
29
+ return unless name == :class && value == "notranslate"
30
+ return if notranslate?
31
+
31
32
  @sequence[-1] = :notranslate
32
33
  end
33
34
 
@@ -38,7 +39,8 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
38
39
  end
39
40
 
40
41
  def tokens
41
- @tokens ||= token_sequences_joined.tap { |tokens| make_sentences_from_last_token(tokens) }
42
+ @tokens ||= token_sequences_joined
43
+ .tap { |tokens| make_sentences_from_last_token(tokens) }
42
44
  end
43
45
 
44
46
  private
@@ -47,9 +49,11 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
47
49
  raw_tokens.each_with_object([]) do |token, tokens|
48
50
  if tokens.empty? # Initial state
49
51
  tokens << token
50
- elsif tokens.last[1] == token[1] # Join series of tokens of the same type into one
52
+ elsif tokens.last[1] == token[1]
53
+ # Join series of tokens of the same type into one
51
54
  tokens.last[0].concat(token[0])
52
- else # If token before :markup is :text we need to split it into sentences
55
+ else
56
+ # If token before :markup is :text we need to split it into sentences
53
57
  make_sentences_from_last_token(tokens)
54
58
  tokens << token
55
59
  end
@@ -81,36 +85,47 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
81
85
  end
82
86
  # rubocop:enable Metrics/MethodLength
83
87
 
88
+ # Whether the sequence is between `:notranslate` and `:end_notranslate`
89
+ def notranslate?
90
+ @sequence.select { |item| item[/notranslate/] }.last == :notranslate
91
+ end
92
+
93
+ # Returns the item for last opened span
94
+ def end_span
95
+ return :markup unless notranslate?
96
+ opened_spans = @sequence
97
+ .reverse
98
+ .take_while { |item| item != :notranslate }
99
+ .map { |item| { span: 1, end_span: -1 }.fetch(item, 0) }
100
+ .reduce(0, :+)
101
+
102
+ opened_spans.positive? ? :end_span : :end_notranslate
103
+ end
104
+
84
105
  def raw_tokens
85
106
  @raw_tokens ||= @indicies.map.with_index do |i, n|
86
107
  first = i
87
108
  last = (@indicies[n + 1] || 0) - 1
88
109
  value = fix_utf(@source.byteslice(first..last))
89
110
  type = @sequence[n]
90
- type = :text if type == :notranslate
111
+ type = :text if INNER_SPANS.include?(type)
91
112
  [value, type]
92
113
  end
93
114
  end
94
115
 
95
116
  def fix_utf(value)
96
- value.encode(
97
- "UTF-8", undef: :replace, invalid: :replace, replace: " "
98
- )
99
- end
100
-
101
- def nontranslate?(name)
102
- @sequence[-2] == :notranslate && name == :span
117
+ value.encode("UTF-8", undef: :replace, invalid: :replace, replace: " ")
103
118
  end
104
119
 
105
120
  def start_markup(name)
106
121
  @context << name
107
- @sequence << :markup
122
+ @sequence << (notranslate? ? (name == :span ? :span : :text) : :markup)
108
123
  @indicies << @pos - 1
109
124
  end
110
125
 
111
126
  def end_markup(name)
112
127
  @context.pop
113
- @sequence << (nontranslate?(name) ? :notranslate : :markup)
128
+ @sequence << (notranslate? ? (name == :span ? end_span : :text) : :markup)
114
129
  @indicies << @pos - 1 unless @pos == @source.bytesize
115
130
  end
116
131
 
@@ -125,5 +140,6 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
125
140
  end
126
141
 
127
142
  SKIP = %i[script style].freeze
143
+ INNER_SPANS = %i[notranslate span end_span end_notranslate].freeze
128
144
  HTML_OPTIONS = { smart: true, skip: :skip_none }.freeze
129
145
  end
@@ -1,3 +1,3 @@
1
1
  module GoogleTranslateDiff
2
- VERSION = "1.0.10".freeze
2
+ VERSION = "1.0.11".freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: google_translate_diff
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.10
4
+ version: 1.0.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Victor Sokolov
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-06-21 00:00:00.000000000 Z
11
+ date: 2019-01-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler