google_translate_diff 1.0.6 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/google_translate_diff.gemspec +1 -1
- data/lib/google_translate_diff/tokenizer.rb +50 -22
- data/lib/google_translate_diff/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f06bc3ed14da7bb671f3afc188375f72da6b1026
|
4
|
+
data.tar.gz: 62af00a1a43df3f8c1773346709c3f09e95cf186
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f2662f618fc76e10877f78ca07dbce8bfda0b0f2c08da06643ea2d91491354631d15d44137f5fb76a33770fc59becb442471092fd030281e28c17d05722e7f1c
|
7
|
+
data.tar.gz: 39ddf3a505c96560016e24a9f3bd303e8e36556894e7aea7c612ce6c9078266ef1e4b7756f807e256507bf40105bcc226bab80f65273798b004b4756f8b0dd31
|
@@ -44,7 +44,7 @@ between revisions of long texts.
|
|
44
44
|
spec.add_development_dependency "simplecov"
|
45
45
|
|
46
46
|
spec.add_dependency "google-cloud-translate"
|
47
|
-
spec.add_dependency "ox"
|
47
|
+
spec.add_dependency "ox"
|
48
48
|
spec.add_dependency "dry-initializer"
|
49
49
|
spec.add_dependency "punkt-segmenter"
|
50
50
|
end
|
@@ -1,40 +1,58 @@
|
|
1
1
|
class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
|
2
2
|
def initialize(source)
|
3
3
|
@pos = nil
|
4
|
-
@prev = 1
|
5
|
-
@skip = false
|
6
4
|
@source = source
|
7
5
|
@tokens = []
|
6
|
+
@context = []
|
7
|
+
@sequence = []
|
8
|
+
@indicies = []
|
8
9
|
end
|
9
10
|
|
10
|
-
attr_reader :texts, :tokens, :prev, :pos
|
11
|
-
|
12
11
|
def start_element(name)
|
13
|
-
@
|
12
|
+
@context << name
|
13
|
+
@sequence << :markup
|
14
|
+
@indicies << @pos - 1
|
14
15
|
end
|
15
16
|
|
16
17
|
def end_element(name)
|
17
|
-
@
|
18
|
+
@context.pop
|
19
|
+
@sequence << (nontranslate?(name) ? :notranslate : :markup)
|
20
|
+
@indicies << @pos - 1 unless @pos == @source.bytesize
|
18
21
|
end
|
19
22
|
|
20
|
-
def
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
@tokens.concat(sentences(value))
|
23
|
+
def attr(name, value)
|
24
|
+
unless @context.last == :span && name == :class && value == "notranslate"
|
25
|
+
return
|
26
|
+
end
|
27
|
+
@sequence[-1] = :notranslate
|
28
|
+
end
|
27
29
|
|
28
|
-
|
30
|
+
def text(_)
|
31
|
+
@sequence << (SKIP.include?(@context.last) ? :markup : :text)
|
32
|
+
@indicies << @pos - 1
|
29
33
|
end
|
30
34
|
|
31
|
-
|
32
|
-
|
33
|
-
|
35
|
+
# rubocop:disable Metrics/AbcSize
|
36
|
+
def tokens
|
37
|
+
raw_tokens.each_with_object([]) do |token, tokens|
|
38
|
+
if tokens.empty?
|
39
|
+
tokens << token
|
40
|
+
elsif tokens.last[1] == token[1]
|
41
|
+
tokens.last[0].concat(token[0])
|
42
|
+
else
|
43
|
+
tokens.concat(sentences(tokens.pop[0])) if tokens.last[1] == :text
|
44
|
+
tokens << token
|
45
|
+
end
|
46
|
+
end
|
34
47
|
end
|
48
|
+
# rubocop:enable Metrics/AbcSize
|
35
49
|
|
36
|
-
|
50
|
+
private
|
51
|
+
|
52
|
+
# rubocop: disable Metrics/MethodLength
|
37
53
|
def sentences(value)
|
54
|
+
return [] if value.empty?
|
55
|
+
|
38
56
|
boundaries =
|
39
57
|
Punkt::SentenceTokenizer
|
40
58
|
.new(value)
|
@@ -49,10 +67,17 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
|
|
49
67
|
[value[left..right], :text]
|
50
68
|
end
|
51
69
|
end
|
70
|
+
# rubocop:enable Metrics/MethodLength
|
52
71
|
|
53
|
-
def
|
54
|
-
|
55
|
-
|
72
|
+
def raw_tokens
|
73
|
+
@indicies.map.with_index do |i, n|
|
74
|
+
first = i
|
75
|
+
last = (@indicies[n + 1] || 0) - 1
|
76
|
+
value = fix_utf(@source.byteslice(first..last))
|
77
|
+
type = @sequence[n]
|
78
|
+
type = :text if type == :notranslate
|
79
|
+
[value, type]
|
80
|
+
end
|
56
81
|
end
|
57
82
|
|
58
83
|
def fix_utf(value)
|
@@ -61,12 +86,15 @@ class GoogleTranslateDiff::Tokenizer < ::Ox::Sax
|
|
61
86
|
)
|
62
87
|
end
|
63
88
|
|
89
|
+
def nontranslate?(name)
|
90
|
+
@sequence[-2] == :notranslate && name == :span
|
91
|
+
end
|
92
|
+
|
64
93
|
class << self
|
65
94
|
def tokenize(value)
|
66
95
|
return [] if value.nil?
|
67
96
|
tokenizer = new(value).tap do |h|
|
68
97
|
Ox.sax_parse(h, StringIO.new(value), HTML_OPTIONS)
|
69
|
-
h.cut_last_token
|
70
98
|
end
|
71
99
|
tokenizer.tokens
|
72
100
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: google_translate_diff
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.6
|
4
|
+
version: 1.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Victor Sokolov
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-08-
|
11
|
+
date: 2017-08-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|