google_translate_diff 1.0.9 → 1.0.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +3 -0
- data/lib/google_translate_diff/tokenizer.rb +37 -15
- data/lib/google_translate_diff/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cbd316d964f6fdb9f119c66b2087e475bbed2b04
|
4
|
+
data.tar.gz: 02ae88623b5b4ddddebbf9b23a9bb02075516d6a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 84d6f5f4287a586455900838a534760a1eec4755551ee920325336d6d513a0b59b0727acfadbcb84a82cc7a17b257bcce4c4ae94d4c8f1199948ce3a2b7be076
|
7
|
+
data.tar.gz: da22b6c0d1ddafeff3243f0e6b4aec4f4394451f22b307a8582c4a3581330f565f029d0db30907f1ef58f3c470a7a378e255527e2735998afae3af4f36039a56
|
data/.rubocop.yml
CHANGED
data/lib/google_translate_diff/tokenizer.rb
CHANGED
@@ -8,16 +8,20 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
|
|
8
8
|
@indicies = []
|
9
9
|
end
|
10
10
|
|
11
|
+
def instruct(target)
|
12
|
+
start_markup(target)
|
13
|
+
end
|
14
|
+
|
15
|
+
def end_instruct(target)
|
16
|
+
end_markup(target)
|
17
|
+
end
|
18
|
+
|
11
19
|
def start_element(name)
|
12
|
-
|
13
|
-
@sequence << :markup
|
14
|
-
@indicies << @pos - 1
|
20
|
+
start_markup(name)
|
15
21
|
end
|
16
22
|
|
17
23
|
def end_element(name)
|
18
|
-
|
19
|
-
@sequence << (nontranslate?(name) ? :notranslate : :markup)
|
20
|
-
@indicies << @pos - 1 unless @pos == @source.bytesize
|
24
|
+
end_markup(name)
|
21
25
|
end
|
22
26
|
|
23
27
|
def attr(name, value)
|
@@ -33,22 +37,29 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
|
|
33
37
|
@indicies << @pos - 1
|
34
38
|
end
|
35
39
|
|
36
|
-
# rubocop:disable Metrics/AbcSize
|
37
40
|
def tokens
|
38
|
-
@tokens ||=
|
39
|
-
|
41
|
+
@tokens ||= token_sequences_joined.tap { |tokens| make_sentences_from_last_token(tokens) }
|
42
|
+
end
|
43
|
+
|
44
|
+
private
|
45
|
+
|
46
|
+
def token_sequences_joined
|
47
|
+
raw_tokens.each_with_object([]) do |token, tokens|
|
48
|
+
if tokens.empty? # Initial state
|
40
49
|
tokens << token
|
41
|
-
elsif tokens.last[1] == token[1]
|
50
|
+
elsif tokens.last[1] == token[1] # Join series of tokens of the same type into one
|
42
51
|
tokens.last[0].concat(token[0])
|
43
|
-
else
|
44
|
-
|
52
|
+
else # If token before :markup is :text we need to split it into sentences
|
53
|
+
make_sentences_from_last_token(tokens)
|
45
54
|
tokens << token
|
46
55
|
end
|
47
56
|
end
|
48
57
|
end
|
49
|
-
# rubocop:enable Metrics/AbcSize
|
50
58
|
|
51
|
-
|
59
|
+
def make_sentences_from_last_token(tokens)
|
60
|
+
return if tokens.empty?
|
61
|
+
tokens.concat(sentences(tokens.pop[0])) if tokens.last[1] == :text
|
62
|
+
end
|
52
63
|
|
53
64
|
# rubocop: disable Metrics/MethodLength
|
54
65
|
def sentences(value)
|
@@ -91,13 +102,24 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
|
|
91
102
|
@sequence[-2] == :notranslate && name == :span
|
92
103
|
end
|
93
104
|
|
105
|
+
def start_markup(name)
|
106
|
+
@context << name
|
107
|
+
@sequence << :markup
|
108
|
+
@indicies << @pos - 1
|
109
|
+
end
|
110
|
+
|
111
|
+
def end_markup(name)
|
112
|
+
@context.pop
|
113
|
+
@sequence << (nontranslate?(name) ? :notranslate : :markup)
|
114
|
+
@indicies << @pos - 1 unless @pos == @source.bytesize
|
115
|
+
end
|
116
|
+
|
94
117
|
class << self
|
95
118
|
def tokenize(value)
|
96
119
|
return [] if value.nil?
|
97
120
|
tokenizer = new(value).tap do |h|
|
98
121
|
Ox.sax_parse(h, StringIO.new(value), HTML_OPTIONS)
|
99
122
|
end
|
100
|
-
puts tokenizer.tokens.inspect
|
101
123
|
tokenizer.tokens
|
102
124
|
end
|
103
125
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: google_translate_diff
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.9
|
4
|
+
version: 1.0.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Victor Sokolov
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-06-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|