google_translate_diff 1.0.10 → 1.0.11

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cbd316d964f6fdb9f119c66b2087e475bbed2b04
4
- data.tar.gz: 02ae88623b5b4ddddebbf9b23a9bb02075516d6a
3
+ metadata.gz: 07c59774517f93999222ed67cd66eccb5ffad592
4
+ data.tar.gz: f75bd48f486fca6f1db63c1c188766b5112bc71c
5
5
  SHA512:
6
- metadata.gz: 84d6f5f4287a586455900838a534760a1eec4755551ee920325336d6d513a0b59b0727acfadbcb84a82cc7a17b257bcce4c4ae94d4c8f1199948ce3a2b7be076
7
- data.tar.gz: da22b6c0d1ddafeff3243f0e6b4aec4f4394451f22b307a8582c4a3581330f565f029d0db30907f1ef58f3c470a7a378e255527e2735998afae3af4f36039a56
6
+ metadata.gz: 7c7f741db39feb6880cb927a17ebe96b65f6e9d43a2bd0680e9aadece192c8786e086ef1e34f49581f981e86f504242a98c1b6cd3636b02cca3c94af564d176d
7
+ data.tar.gz: 96e64720cba873ba0a7513c7a390507ef78846281ae3b2dac4545cc65f8af437405f144260204e80654a9ed59bfe373622f858f2f3dc4a732c16728ac870c092
@@ -25,9 +25,10 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
25
25
  end
26
26
 
27
27
  def attr(name, value)
28
- unless @context.last == :span && name == :class && value == "notranslate"
29
- return
30
- end
28
+ return unless @context.last == :span
29
+ return unless name == :class && value == "notranslate"
30
+ return if notranslate?
31
+
31
32
  @sequence[-1] = :notranslate
32
33
  end
33
34
 
@@ -38,7 +39,8 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
38
39
  end
39
40
 
40
41
  def tokens
41
- @tokens ||= token_sequences_joined.tap { |tokens| make_sentences_from_last_token(tokens) }
42
+ @tokens ||= token_sequences_joined
43
+ .tap { |tokens| make_sentences_from_last_token(tokens) }
42
44
  end
43
45
 
44
46
  private
@@ -47,9 +49,11 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
47
49
  raw_tokens.each_with_object([]) do |token, tokens|
48
50
  if tokens.empty? # Initial state
49
51
  tokens << token
50
- elsif tokens.last[1] == token[1] # Join series of tokens of the same type into one
52
+ elsif tokens.last[1] == token[1]
53
+ # Join series of tokens of the same type into one
51
54
  tokens.last[0].concat(token[0])
52
- else # If token before :markup is :text we need to split it into sentences
55
+ else
56
+ # If token before :markup is :text we need to split it into sentences
53
57
  make_sentences_from_last_token(tokens)
54
58
  tokens << token
55
59
  end
@@ -81,36 +85,47 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
81
85
  end
82
86
  # rubocop:enable Metrics/MethodLength
83
87
 
88
+ # Whether the sequence is between `:notranslate` and `:end_notranslate`
89
+ def notranslate?
90
+ @sequence.select { |item| item[/notranslate/] }.last == :notranslate
91
+ end
92
+
93
+ # Returns the item for last opened span
94
+ def end_span
95
+ return :markup unless notranslate?
96
+ opened_spans = @sequence
97
+ .reverse
98
+ .take_while { |item| item != :notranslate }
99
+ .map { |item| { span: 1, end_span: -1 }.fetch(item, 0) }
100
+ .reduce(0, :+)
101
+
102
+ opened_spans.positive? ? :end_span : :end_notranslate
103
+ end
104
+
84
105
  def raw_tokens
85
106
  @raw_tokens ||= @indicies.map.with_index do |i, n|
86
107
  first = i
87
108
  last = (@indicies[n + 1] || 0) - 1
88
109
  value = fix_utf(@source.byteslice(first..last))
89
110
  type = @sequence[n]
90
- type = :text if type == :notranslate
111
+ type = :text if INNER_SPANS.include?(type)
91
112
  [value, type]
92
113
  end
93
114
  end
94
115
 
95
116
  def fix_utf(value)
96
- value.encode(
97
- "UTF-8", undef: :replace, invalid: :replace, replace: " "
98
- )
99
- end
100
-
101
- def nontranslate?(name)
102
- @sequence[-2] == :notranslate && name == :span
117
+ value.encode("UTF-8", undef: :replace, invalid: :replace, replace: " ")
103
118
  end
104
119
 
105
120
  def start_markup(name)
106
121
  @context << name
107
- @sequence << :markup
122
+ @sequence << (notranslate? ? (name == :span ? :span : :text) : :markup)
108
123
  @indicies << @pos - 1
109
124
  end
110
125
 
111
126
  def end_markup(name)
112
127
  @context.pop
113
- @sequence << (nontranslate?(name) ? :notranslate : :markup)
128
+ @sequence << (notranslate? ? (name == :span ? end_span : :text) : :markup)
114
129
  @indicies << @pos - 1 unless @pos == @source.bytesize
115
130
  end
116
131
 
@@ -125,5 +140,6 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
125
140
  end
126
141
 
127
142
  SKIP = %i[script style].freeze
143
+ INNER_SPANS = %i[notranslate span end_span end_notranslate].freeze
128
144
  HTML_OPTIONS = { smart: true, skip: :skip_none }.freeze
129
145
  end
@@ -1,3 +1,3 @@
1
1
  module GoogleTranslateDiff
2
- VERSION = "1.0.10".freeze
2
+ VERSION = "1.0.11".freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: google_translate_diff
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.10
4
+ version: 1.0.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Victor Sokolov
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-06-21 00:00:00.000000000 Z
11
+ date: 2019-01-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler