google_translate_diff 1.0.6 → 1.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/google_translate_diff.gemspec +1 -1
- data/lib/google_translate_diff/tokenizer.rb +50 -22
- data/lib/google_translate_diff/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f06bc3ed14da7bb671f3afc188375f72da6b1026
|
4
|
+
data.tar.gz: 62af00a1a43df3f8c1773346709c3f09e95cf186
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f2662f618fc76e10877f78ca07dbce8bfda0b0f2c08da06643ea2d91491354631d15d44137f5fb76a33770fc59becb442471092fd030281e28c17d05722e7f1c
|
7
|
+
data.tar.gz: 39ddf3a505c96560016e24a9f3bd303e8e36556894e7aea7c612ce6c9078266ef1e4b7756f807e256507bf40105bcc226bab80f65273798b004b4756f8b0dd31
|
@@ -44,7 +44,7 @@ between revisions of long texts.
|
|
44
44
|
spec.add_development_dependency "simplecov"
|
45
45
|
|
46
46
|
spec.add_dependency "google-cloud-translate"
|
47
|
-
spec.add_dependency "ox"
|
47
|
+
spec.add_dependency "ox"
|
48
48
|
spec.add_dependency "dry-initializer"
|
49
49
|
spec.add_dependency "punkt-segmenter"
|
50
50
|
end
|
@@ -1,40 +1,58 @@
|
|
1
1
|
class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
|
2
2
|
def initialize(source)
|
3
3
|
@pos = nil
|
4
|
-
@prev = 1
|
5
|
-
@skip = false
|
6
4
|
@source = source
|
7
5
|
@tokens = []
|
6
|
+
@context = []
|
7
|
+
@sequence = []
|
8
|
+
@indicies = []
|
8
9
|
end
|
9
10
|
|
10
|
-
attr_reader :texts, :tokens, :prev, :pos
|
11
|
-
|
12
11
|
def start_element(name)
|
13
|
-
@
|
12
|
+
@context << name
|
13
|
+
@sequence << :markup
|
14
|
+
@indicies << @pos - 1
|
14
15
|
end
|
15
16
|
|
16
17
|
def end_element(name)
|
17
|
-
@
|
18
|
+
@context.pop
|
19
|
+
@sequence << (nontranslate?(name) ? :notranslate : :markup)
|
20
|
+
@indicies << @pos - 1 unless @pos == @source.bytesize
|
18
21
|
end
|
19
22
|
|
20
|
-
def
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
@tokens.concat(sentences(value))
|
23
|
+
def attr(name, value)
|
24
|
+
unless @context.last == :span && name == :class && value == "notranslate"
|
25
|
+
return
|
26
|
+
end
|
27
|
+
@sequence[-1] = :notranslate
|
28
|
+
end
|
27
29
|
|
28
|
-
|
30
|
+
def text(_)
|
31
|
+
@sequence << (SKIP.include?(@context.last) ? :markup : :text)
|
32
|
+
@indicies << @pos - 1
|
29
33
|
end
|
30
34
|
|
31
|
-
|
32
|
-
|
33
|
-
|
35
|
+
# rubocop:disable Metrics/AbcSize
|
36
|
+
def tokens
|
37
|
+
raw_tokens.each_with_object([]) do |token, tokens|
|
38
|
+
if tokens.empty?
|
39
|
+
tokens << token
|
40
|
+
elsif tokens.last[1] == token[1]
|
41
|
+
tokens.last[0].concat(token[0])
|
42
|
+
else
|
43
|
+
tokens.concat(sentences(tokens.pop[0])) if tokens.last[1] == :text
|
44
|
+
tokens << token
|
45
|
+
end
|
46
|
+
end
|
34
47
|
end
|
48
|
+
# rubocop:enable Metrics/AbcSize
|
35
49
|
|
36
|
-
|
50
|
+
private
|
51
|
+
|
52
|
+
# rubocop: disable Metrics/MethodLength
|
37
53
|
def sentences(value)
|
54
|
+
return [] if value.empty?
|
55
|
+
|
38
56
|
boundaries =
|
39
57
|
Punkt::SentenceTokenizer
|
40
58
|
.new(value)
|
@@ -49,10 +67,17 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
|
|
49
67
|
[value[left..right], :text]
|
50
68
|
end
|
51
69
|
end
|
70
|
+
# rubocop:enable Metrics/MethodLength
|
52
71
|
|
53
|
-
def
|
54
|
-
|
55
|
-
|
72
|
+
def raw_tokens
|
73
|
+
@indicies.map.with_index do |i, n|
|
74
|
+
first = i
|
75
|
+
last = (@indicies[n + 1] || 0) - 1
|
76
|
+
value = fix_utf(@source.byteslice(first..last))
|
77
|
+
type = @sequence[n]
|
78
|
+
type = :text if type == :notranslate
|
79
|
+
[value, type]
|
80
|
+
end
|
56
81
|
end
|
57
82
|
|
58
83
|
def fix_utf(value)
|
@@ -61,12 +86,15 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
|
|
61
86
|
)
|
62
87
|
end
|
63
88
|
|
89
|
+
def nontranslate?(name)
|
90
|
+
@sequence[-2] == :notranslate && name == :span
|
91
|
+
end
|
92
|
+
|
64
93
|
class << self
|
65
94
|
def tokenize(value)
|
66
95
|
return [] if value.nil?
|
67
96
|
tokenizer = new(value).tap do |h|
|
68
97
|
Ox.sax_parse(h, StringIO.new(value), HTML_OPTIONS)
|
69
|
-
h.cut_last_token
|
70
98
|
end
|
71
99
|
tokenizer.tokens
|
72
100
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: google_translate_diff
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.6
|
4
|
+
version: 1.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Victor Sokolov
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-08-
|
11
|
+
date: 2017-08-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|