google_translate_diff 1.0.10 → 1.0.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/google_translate_diff/tokenizer.rb +32 -16
- data/lib/google_translate_diff/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 07c59774517f93999222ed67cd66eccb5ffad592
|
4
|
+
data.tar.gz: f75bd48f486fca6f1db63c1c188766b5112bc71c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7c7f741db39feb6880cb927a17ebe96b65f6e9d43a2bd0680e9aadece192c8786e086ef1e34f49581f981e86f504242a98c1b6cd3636b02cca3c94af564d176d
|
7
|
+
data.tar.gz: 96e64720cba873ba0a7513c7a390507ef78846281ae3b2dac4545cc65f8af437405f144260204e80654a9ed59bfe373622f858f2f3dc4a732c16728ac870c092
|
@@ -25,9 +25,10 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
|
|
25
25
|
end
|
26
26
|
|
27
27
|
def attr(name, value)
|
28
|
-
unless @context.last == :span
|
29
|
-
|
30
|
-
|
28
|
+
return unless @context.last == :span
|
29
|
+
return unless name == :class && value == "notranslate"
|
30
|
+
return if notranslate?
|
31
|
+
|
31
32
|
@sequence[-1] = :notranslate
|
32
33
|
end
|
33
34
|
|
@@ -38,7 +39,8 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
|
|
38
39
|
end
|
39
40
|
|
40
41
|
def tokens
|
41
|
-
@tokens ||= token_sequences_joined
|
42
|
+
@tokens ||= token_sequences_joined
|
43
|
+
.tap { |tokens| make_sentences_from_last_token(tokens) }
|
42
44
|
end
|
43
45
|
|
44
46
|
private
|
@@ -47,9 +49,11 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
|
|
47
49
|
raw_tokens.each_with_object([]) do |token, tokens|
|
48
50
|
if tokens.empty? # Initial state
|
49
51
|
tokens << token
|
50
|
-
elsif tokens.last[1] == token[1]
|
52
|
+
elsif tokens.last[1] == token[1]
|
53
|
+
# Join series of tokens of the same type into one
|
51
54
|
tokens.last[0].concat(token[0])
|
52
|
-
else
|
55
|
+
else
|
56
|
+
# If token before :markup is :text we need to split it into sentences
|
53
57
|
make_sentences_from_last_token(tokens)
|
54
58
|
tokens << token
|
55
59
|
end
|
@@ -81,36 +85,47 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
|
|
81
85
|
end
|
82
86
|
# rubocop:enable Metrics/MethodLength
|
83
87
|
|
88
|
+
# Whether the sequence is between `:notranslate` and `:end_notranslate`
|
89
|
+
def notranslate?
|
90
|
+
@sequence.select { |item| item[/notranslate/] }.last == :notranslate
|
91
|
+
end
|
92
|
+
|
93
|
+
# Returns the item for last opened span
|
94
|
+
def end_span
|
95
|
+
return :markup unless notranslate?
|
96
|
+
opened_spans = @sequence
|
97
|
+
.reverse
|
98
|
+
.take_while { |item| item != :notranslate }
|
99
|
+
.map { |item| { span: 1, end_span: -1 }.fetch(item, 0) }
|
100
|
+
.reduce(0, :+)
|
101
|
+
|
102
|
+
opened_spans.positive? ? :end_span : :end_notranslate
|
103
|
+
end
|
104
|
+
|
84
105
|
def raw_tokens
|
85
106
|
@raw_tokens ||= @indicies.map.with_index do |i, n|
|
86
107
|
first = i
|
87
108
|
last = (@indicies[n + 1] || 0) - 1
|
88
109
|
value = fix_utf(@source.byteslice(first..last))
|
89
110
|
type = @sequence[n]
|
90
|
-
type = :text if type
|
111
|
+
type = :text if INNER_SPANS.include?(type)
|
91
112
|
[value, type]
|
92
113
|
end
|
93
114
|
end
|
94
115
|
|
95
116
|
def fix_utf(value)
|
96
|
-
value.encode(
|
97
|
-
"UTF-8", undef: :replace, invalid: :replace, replace: " "
|
98
|
-
)
|
99
|
-
end
|
100
|
-
|
101
|
-
def nontranslate?(name)
|
102
|
-
@sequence[-2] == :notranslate && name == :span
|
117
|
+
value.encode("UTF-8", undef: :replace, invalid: :replace, replace: " ")
|
103
118
|
end
|
104
119
|
|
105
120
|
def start_markup(name)
|
106
121
|
@context << name
|
107
|
-
@sequence << :markup
|
122
|
+
@sequence << (notranslate? ? (name == :span ? :span : :text) : :markup)
|
108
123
|
@indicies << @pos - 1
|
109
124
|
end
|
110
125
|
|
111
126
|
def end_markup(name)
|
112
127
|
@context.pop
|
113
|
-
@sequence << (
|
128
|
+
@sequence << (notranslate? ? (name == :span ? end_span : :text) : :markup)
|
114
129
|
@indicies << @pos - 1 unless @pos == @source.bytesize
|
115
130
|
end
|
116
131
|
|
@@ -125,5 +140,6 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
|
|
125
140
|
end
|
126
141
|
|
127
142
|
SKIP = %i[script style].freeze
|
143
|
+
INNER_SPANS = %i[notranslate span end_span end_notranslate].freeze
|
128
144
|
HTML_OPTIONS = { smart: true, skip: :skip_none }.freeze
|
129
145
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: google_translate_diff
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.10
|
4
|
+
version: 1.0.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Victor Sokolov
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-01-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|