google_translate_diff 1.0.9 → 1.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +3 -0
- data/lib/google_translate_diff/tokenizer.rb +37 -15
- data/lib/google_translate_diff/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cbd316d964f6fdb9f119c66b2087e475bbed2b04
|
4
|
+
data.tar.gz: 02ae88623b5b4ddddebbf9b23a9bb02075516d6a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 84d6f5f4287a586455900838a534760a1eec4755551ee920325336d6d513a0b59b0727acfadbcb84a82cc7a17b257bcce4c4ae94d4c8f1199948ce3a2b7be076
|
7
|
+
data.tar.gz: da22b6c0d1ddafeff3243f0e6b4aec4f4394451f22b307a8582c4a3581330f565f029d0db30907f1ef58f3c470a7a378e255527e2735998afae3af4f36039a56
|
data/.rubocop.yml
CHANGED
data/lib/google_translate_diff/tokenizer.rb
CHANGED
@@ -8,16 +8,20 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
|
|
8
8
|
@indicies = []
|
9
9
|
end
|
10
10
|
|
11
|
+
def instruct(target)
|
12
|
+
start_markup(target)
|
13
|
+
end
|
14
|
+
|
15
|
+
def end_instruct(target)
|
16
|
+
end_markup(target)
|
17
|
+
end
|
18
|
+
|
11
19
|
def start_element(name)
|
12
|
-
|
13
|
-
@sequence << :markup
|
14
|
-
@indicies << @pos - 1
|
20
|
+
start_markup(name)
|
15
21
|
end
|
16
22
|
|
17
23
|
def end_element(name)
|
18
|
-
|
19
|
-
@sequence << (nontranslate?(name) ? :notranslate : :markup)
|
20
|
-
@indicies << @pos - 1 unless @pos == @source.bytesize
|
24
|
+
end_markup(name)
|
21
25
|
end
|
22
26
|
|
23
27
|
def attr(name, value)
|
@@ -33,22 +37,29 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
|
|
33
37
|
@indicies << @pos - 1
|
34
38
|
end
|
35
39
|
|
36
|
-
# rubocop:disable Metrics/AbcSize
|
37
40
|
def tokens
|
38
|
-
@tokens ||=
|
39
|
-
|
41
|
+
@tokens ||= token_sequences_joined.tap { |tokens| make_sentences_from_last_token(tokens) }
|
42
|
+
end
|
43
|
+
|
44
|
+
private
|
45
|
+
|
46
|
+
def token_sequences_joined
|
47
|
+
raw_tokens.each_with_object([]) do |token, tokens|
|
48
|
+
if tokens.empty? # Initial state
|
40
49
|
tokens << token
|
41
|
-
elsif tokens.last[1] == token[1]
|
50
|
+
elsif tokens.last[1] == token[1] # Join series of tokens of the same type into one
|
42
51
|
tokens.last[0].concat(token[0])
|
43
|
-
else
|
44
|
-
|
52
|
+
else # If token before :markup is :text we need to split it into sentences
|
53
|
+
make_sentences_from_last_token(tokens)
|
45
54
|
tokens << token
|
46
55
|
end
|
47
56
|
end
|
48
57
|
end
|
49
|
-
# rubocop:enable Metrics/AbcSize
|
50
58
|
|
51
|
-
|
59
|
+
def make_sentences_from_last_token(tokens)
|
60
|
+
return if tokens.empty?
|
61
|
+
tokens.concat(sentences(tokens.pop[0])) if tokens.last[1] == :text
|
62
|
+
end
|
52
63
|
|
53
64
|
# rubocop: disable Metrics/MethodLength
|
54
65
|
def sentences(value)
|
@@ -91,13 +102,24 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
|
|
91
102
|
@sequence[-2] == :notranslate && name == :span
|
92
103
|
end
|
93
104
|
|
105
|
+
def start_markup(name)
|
106
|
+
@context << name
|
107
|
+
@sequence << :markup
|
108
|
+
@indicies << @pos - 1
|
109
|
+
end
|
110
|
+
|
111
|
+
def end_markup(name)
|
112
|
+
@context.pop
|
113
|
+
@sequence << (nontranslate?(name) ? :notranslate : :markup)
|
114
|
+
@indicies << @pos - 1 unless @pos == @source.bytesize
|
115
|
+
end
|
116
|
+
|
94
117
|
class << self
|
95
118
|
def tokenize(value)
|
96
119
|
return [] if value.nil?
|
97
120
|
tokenizer = new(value).tap do |h|
|
98
121
|
Ox.sax_parse(h, StringIO.new(value), HTML_OPTIONS)
|
99
122
|
end
|
100
|
-
puts tokenizer.tokens.inspect
|
101
123
|
tokenizer.tokens
|
102
124
|
end
|
103
125
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: google_translate_diff
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.9
|
4
|
+
version: 1.0.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Victor Sokolov
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-06-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|