google_translate_diff 1.0.10 → 1.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/google_translate_diff/tokenizer.rb +32 -16
- data/lib/google_translate_diff/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 07c59774517f93999222ed67cd66eccb5ffad592
|
4
|
+
data.tar.gz: f75bd48f486fca6f1db63c1c188766b5112bc71c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7c7f741db39feb6880cb927a17ebe96b65f6e9d43a2bd0680e9aadece192c8786e086ef1e34f49581f981e86f504242a98c1b6cd3636b02cca3c94af564d176d
|
7
|
+
data.tar.gz: 96e64720cba873ba0a7513c7a390507ef78846281ae3b2dac4545cc65f8af437405f144260204e80654a9ed59bfe373622f858f2f3dc4a732c16728ac870c092
|
@@ -25,9 +25,10 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
|
|
25
25
|
end
|
26
26
|
|
27
27
|
def attr(name, value)
|
28
|
-
unless @context.last == :span
|
29
|
-
|
30
|
-
|
28
|
+
return unless @context.last == :span
|
29
|
+
return unless name == :class && value == "notranslate"
|
30
|
+
return if notranslate?
|
31
|
+
|
31
32
|
@sequence[-1] = :notranslate
|
32
33
|
end
|
33
34
|
|
@@ -38,7 +39,8 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
|
|
38
39
|
end
|
39
40
|
|
40
41
|
def tokens
|
41
|
-
@tokens ||= token_sequences_joined
|
42
|
+
@tokens ||= token_sequences_joined
|
43
|
+
.tap { |tokens| make_sentences_from_last_token(tokens) }
|
42
44
|
end
|
43
45
|
|
44
46
|
private
|
@@ -47,9 +49,11 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
|
|
47
49
|
raw_tokens.each_with_object([]) do |token, tokens|
|
48
50
|
if tokens.empty? # Initial state
|
49
51
|
tokens << token
|
50
|
-
elsif tokens.last[1] == token[1]
|
52
|
+
elsif tokens.last[1] == token[1]
|
53
|
+
# Join series of tokens of the same type into one
|
51
54
|
tokens.last[0].concat(token[0])
|
52
|
-
else
|
55
|
+
else
|
56
|
+
# If token before :markup is :text we need to split it into sentences
|
53
57
|
make_sentences_from_last_token(tokens)
|
54
58
|
tokens << token
|
55
59
|
end
|
@@ -81,36 +85,47 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
|
|
81
85
|
end
|
82
86
|
# rubocop:enable Metrics/MethodLength
|
83
87
|
|
88
|
+
# Whether the sequence is between `:notranslate` and `:end_notranslate`
|
89
|
+
def notranslate?
|
90
|
+
@sequence.select { |item| item[/notranslate/] }.last == :notranslate
|
91
|
+
end
|
92
|
+
|
93
|
+
# Returns the item for last opened span
|
94
|
+
def end_span
|
95
|
+
return :markup unless notranslate?
|
96
|
+
opened_spans = @sequence
|
97
|
+
.reverse
|
98
|
+
.take_while { |item| item != :notranslate }
|
99
|
+
.map { |item| { span: 1, end_span: -1 }.fetch(item, 0) }
|
100
|
+
.reduce(0, :+)
|
101
|
+
|
102
|
+
opened_spans.positive? ? :end_span : :end_notranslate
|
103
|
+
end
|
104
|
+
|
84
105
|
def raw_tokens
|
85
106
|
@raw_tokens ||= @indicies.map.with_index do |i, n|
|
86
107
|
first = i
|
87
108
|
last = (@indicies[n + 1] || 0) - 1
|
88
109
|
value = fix_utf(@source.byteslice(first..last))
|
89
110
|
type = @sequence[n]
|
90
|
-
type = :text if type
|
111
|
+
type = :text if INNER_SPANS.include?(type)
|
91
112
|
[value, type]
|
92
113
|
end
|
93
114
|
end
|
94
115
|
|
95
116
|
def fix_utf(value)
|
96
|
-
value.encode(
|
97
|
-
"UTF-8", undef: :replace, invalid: :replace, replace: " "
|
98
|
-
)
|
99
|
-
end
|
100
|
-
|
101
|
-
def nontranslate?(name)
|
102
|
-
@sequence[-2] == :notranslate && name == :span
|
117
|
+
value.encode("UTF-8", undef: :replace, invalid: :replace, replace: " ")
|
103
118
|
end
|
104
119
|
|
105
120
|
def start_markup(name)
|
106
121
|
@context << name
|
107
|
-
@sequence << :markup
|
122
|
+
@sequence << (notranslate? ? (name == :span ? :span : :text) : :markup)
|
108
123
|
@indicies << @pos - 1
|
109
124
|
end
|
110
125
|
|
111
126
|
def end_markup(name)
|
112
127
|
@context.pop
|
113
|
-
@sequence << (
|
128
|
+
@sequence << (notranslate? ? (name == :span ? end_span : :text) : :markup)
|
114
129
|
@indicies << @pos - 1 unless @pos == @source.bytesize
|
115
130
|
end
|
116
131
|
|
@@ -125,5 +140,6 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
|
|
125
140
|
end
|
126
141
|
|
127
142
|
SKIP = %i[script style].freeze
|
143
|
+
INNER_SPANS = %i[notranslate span end_span end_notranslate].freeze
|
128
144
|
HTML_OPTIONS = { smart: true, skip: :skip_none }.freeze
|
129
145
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: google_translate_diff
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Victor Sokolov
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-01-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|