konjak 0.0.12 → 0.0.13
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/konjak/segment/gtt.rb +12 -4
- data/lib/konjak/segment.rb +7 -1
- data/lib/konjak/tmx_segmentor/gtt_html_strategy.rb +2 -15
- data/lib/konjak/tmx_segmentor/strategy.rb +21 -1
- data/lib/konjak/tmx_segmentor/text_strategy.rb +2 -15
- data/lib/konjak/version.rb +1 -1
- data/spec/konjak_translate_spec.rb +5 -3
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cd181dadc5d9128c27be3312f50274acffbcfce8
|
4
|
+
data.tar.gz: 29fa12fb01c4409b7587496413b0fda0c46ab0a4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 188010cd203698454c7f2204c20e6a6e19a18a7eb25640d0c9555e011568ff87fe96ba521ab04c1c42975c59636e634992144a825980f9398abb12e72f913996
|
7
|
+
data.tar.gz: ba9cf03f27fe8599815060771ea437fb852b95e0a5f3f1e06524e9ef0c5930b0a373c9fca08cbffd83c44b5238acd51403058775a83fe328a3550c7424cd88ee
|
data/lib/konjak/segment/gtt.rb
CHANGED
@@ -1,18 +1,26 @@
|
|
1
|
+
require 'mem'
|
2
|
+
|
1
3
|
module Konjak
|
2
4
|
class Segment < StructuralElement
|
3
5
|
module GTT
|
6
|
+
include Mem
|
7
|
+
|
4
8
|
Tag = Struct.new(:gtt, :html)
|
5
9
|
|
6
10
|
def compile_gtt_html_pattern
|
7
11
|
regexp = Regexp.escape(text)
|
8
12
|
gtt_tag_ns.each do |n|
|
9
|
-
regexp
|
10
|
-
regexp
|
11
|
-
regexp
|
13
|
+
regexp.sub!(/\\\{#{n}\\\}/) { "(?<n#{n}><(?<_#{n}>\\w+)[^>]*>)" }
|
14
|
+
regexp.gsub!(/\\\{#{n}\\\}/) { "\\k<n#{n}>" }
|
15
|
+
regexp.gsub!(/\\\{\/#{n}\\\}/) { "</\\k<_#{n}>>" }
|
12
16
|
end
|
13
|
-
regexp
|
17
|
+
regexp.gsub!(/(?:\\\s|\n)/m) { '\s' }
|
18
|
+
regexp.gsub!(/(?:\\s)+/m) {|s| s + '++' }
|
19
|
+
regexp.gsub!(/^(?<s>(?:\\s)+)\+\+/) { $~[:s] }
|
20
|
+
regexp.gsub!(/(?<s>(?:\\s)+)\+\+$/) { $~[:s] }
|
14
21
|
Regexp.compile(regexp)
|
15
22
|
end
|
23
|
+
memoize :compile_gtt_html_pattern
|
16
24
|
|
17
25
|
def extract_gtt_tags_from(text)
|
18
26
|
m = text.match(compile_gtt_html_pattern)
|
data/lib/konjak/segment.rb
CHANGED
@@ -1,9 +1,11 @@
|
|
1
1
|
require 'konjak/segment/gtt'
|
2
|
+
require 'mem'
|
2
3
|
|
3
4
|
module Konjak
|
4
5
|
# container
|
5
6
|
class Segment < StructuralElement
|
6
7
|
include GTT
|
8
|
+
include Mem
|
7
9
|
|
8
10
|
# children
|
9
11
|
def text
|
@@ -17,9 +19,13 @@ module Konjak
|
|
17
19
|
|
18
20
|
def compile_pattern
|
19
21
|
regexp = Regexp.escape(text)
|
20
|
-
regexp = regexp.gsub(/(?:\\\s|\n)/m)
|
22
|
+
regexp = regexp.gsub(/(?:\\\s|\n)/m) { '\s' }
|
23
|
+
regexp = regexp.gsub(/(?:\\s)+/m) {|s| s + '++' }
|
24
|
+
regexp = regexp.gsub(/^(?<s>(?:\\s)+)\+\+/) { $~[:s] }
|
25
|
+
regexp = regexp.gsub(/(?<s>(?:\\s)+)\+\+$/) { $~[:s] }
|
21
26
|
Regexp.compile(regexp)
|
22
27
|
end
|
28
|
+
memoize :compile_pattern
|
23
29
|
|
24
30
|
def translation_unit
|
25
31
|
TranslationUnit.new(translation_unit_variant.parent)
|
data/lib/konjak/tmx_segmentor/gtt_html_strategy.rb
CHANGED
@@ -7,21 +7,8 @@ module Konjak
|
|
7
7
|
|
8
8
|
private
|
9
9
|
|
10
|
-
def
|
11
|
-
segment
|
12
|
-
pattern = segment.compile_gtt_html_pattern
|
13
|
-
|
14
|
-
texts = []
|
15
|
-
while true
|
16
|
-
head, match, tail = text.partition(pattern)
|
17
|
-
break if match.empty? || text.length < min_segment_length
|
18
|
-
texts << head unless head.empty?
|
19
|
-
|
20
|
-
texts << SegmentString.new(match, segment)
|
21
|
-
|
22
|
-
text = tail
|
23
|
-
end
|
24
|
-
texts << text
|
10
|
+
def compile_pattern(segment)
|
11
|
+
segment.compile_gtt_html_pattern
|
25
12
|
end
|
26
13
|
end
|
27
14
|
end
|
data/lib/konjak/tmx_segmentor/strategy.rb
CHANGED
@@ -16,10 +16,13 @@ module Konjak
|
|
16
16
|
def segmentize(text)
|
17
17
|
segments = [text]
|
18
18
|
translation_units.each do |translation_unit|
|
19
|
+
segment = translation_unit.variant(@lang).segment
|
20
|
+
|
19
21
|
segments.map! {|text|
|
22
|
+
next text if text.length < min_segment_length
|
20
23
|
next text if text.is_a?(SegmentString)
|
21
24
|
|
22
|
-
split(
|
25
|
+
split(segment, text)
|
23
26
|
}.flatten!
|
24
27
|
end
|
25
28
|
segments
|
@@ -35,6 +38,23 @@ module Konjak
|
|
35
38
|
@options[:min_segment_length]
|
36
39
|
end
|
37
40
|
|
41
|
+
def split(segment, text)
|
42
|
+
texts = []
|
43
|
+
while true
|
44
|
+
break if text.length < min_segment_length
|
45
|
+
|
46
|
+
head, match, tail = text.partition(compile_pattern(segment))
|
47
|
+
break if match.empty?
|
48
|
+
|
49
|
+
texts << head unless head.empty?
|
50
|
+
|
51
|
+
texts << SegmentString.new(match, segment)
|
52
|
+
|
53
|
+
text = tail
|
54
|
+
end
|
55
|
+
texts << text
|
56
|
+
end
|
57
|
+
|
38
58
|
def translation_units
|
39
59
|
@tmx.body.translation_units.sort_by {|tu|
|
40
60
|
-tu.variant(@lang).segment.text.length
|
data/lib/konjak/tmx_segmentor/text_strategy.rb
CHANGED
@@ -7,21 +7,8 @@ module Konjak
|
|
7
7
|
|
8
8
|
private
|
9
9
|
|
10
|
-
def
|
11
|
-
segment
|
12
|
-
pattern = segment.compile_pattern
|
13
|
-
|
14
|
-
texts = []
|
15
|
-
while true
|
16
|
-
head, match, tail = text.partition(pattern)
|
17
|
-
break if match.empty? || text.length < min_segment_length
|
18
|
-
texts << head unless head.empty?
|
19
|
-
|
20
|
-
texts << SegmentString.new(match, segment)
|
21
|
-
|
22
|
-
text = tail
|
23
|
-
end
|
24
|
-
texts << text
|
10
|
+
def compile_pattern(segment)
|
11
|
+
segment.compile_pattern
|
25
12
|
end
|
26
13
|
end
|
27
14
|
end
|
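data/lib/konjak/version.rb
CHANGED
(diff body not captured here; the summary above lists this file as +1 -1)
data/spec/konjak_translate_spec.rb
CHANGED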
@@ -19,8 +19,10 @@ EXPECT
|
|
19
19
|
|
20
20
|
context 'when blanks between words is not exactly match' do
|
21
21
|
let(:doc) { <<DOC }
|
22
|
-
this is data
|
23
|
-
|
22
|
+
this is data (with a non-standard
|
23
|
+
|
24
|
+
|
25
|
+
character: ).
|
24
26
|
this is data (with a non-standard character: ).
|
25
27
|
DOC
|
26
28
|
|
@@ -54,7 +56,7 @@ EXPECT
|
|
54
56
|
context 'when blanks between words is not exactly match' do
|
55
57
|
let(:doc) { <<GTT_HTML }
|
56
58
|
This is
|
57
|
-
<a href="http://example.com">example</a>.
|
59
|
+
<a href="http://example.com">example</a>.
|
58
60
|
And This
|
59
61
|
is
|
60
62
|
<b>example</b>. Yey.
|