konjak 0.0.18 → 0.0.19
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/konjak/segment.rb +2 -2
- data/lib/konjak/tmx_segmentor/strategy.rb +71 -69
- data/lib/konjak/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 768f8768ce6cf9b74a7efdbc099fe192d8315263
|
4
|
+
data.tar.gz: 6ea6d582ac4df35043317bc274bf9391716fe5f3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 90a225fc3be21ca3a273ff7304c5c803d72ed372449a53143090909b48e4272ba8cf9f47bc0522dc502e50582ebcd0c76c3dffce4ef605c7322e3448c9a0ab48
|
7
|
+
data.tar.gz: cdf1f709324975a459b808e7c855efcfa981cee5f96839905ab3744cd52fa80a7247f199957617059b1f509211582aa32558c1a8e666b827ae303a503f29f6b2
|
data/lib/konjak/segment.rb
CHANGED
@@ -23,8 +23,8 @@ module Konjak
|
|
23
23
|
|
24
24
|
def compile_pattern
|
25
25
|
regexp = Regexp.escape(text)
|
26
|
-
regexp.gsub!(/(?<!^)\\\s/)
|
27
|
-
regexp.gsub!(/(?<!^)(?:\\s)
|
26
|
+
regexp.gsub!(/(?<!^)\\\s/) { WHITE_SPACE_PATTERN_TEXT }
|
27
|
+
regexp.gsub!(/(?<!^)(?:\\s)+/) {|s| s + POSSESSIVE_QUALIFIER }
|
28
28
|
Regexp.compile(regexp)
|
29
29
|
end
|
30
30
|
|
@@ -16,22 +16,81 @@ module Konjak
|
|
16
16
|
end
|
17
17
|
|
18
18
|
def segmentize(text)
|
19
|
-
|
20
|
-
translation_units(text).each do |translation_unit|
|
21
|
-
segment = translation_unit.variant(@lang).segment
|
19
|
+
range_segment_pairs = []
|
22
20
|
|
23
|
-
|
21
|
+
translation_units.each {|tu|
|
22
|
+
segment = tu.variant(@lang).segment
|
23
|
+
text.scan(compile_pattern(segment)) {
|
24
|
+
range_segment_pairs << [($~.begin(0)...$~.end(0)), segment]
|
25
|
+
}
|
26
|
+
}
|
27
|
+
|
28
|
+
# Can't split text
|
29
|
+
return [text] if range_segment_pairs.empty?
|
30
|
+
|
31
|
+
range_segment_pairs.uniq! {|rsp| [rsp[0], rsp[1].text] }
|
32
|
+
range_segment_pairs.sort_by! {|(m, s)|
|
33
|
+
[m.begin, -s.text.size]
|
34
|
+
}
|
24
35
|
|
25
|
-
|
26
|
-
next text if text.length < min_segment_length
|
27
|
-
next text if text.is_a?(SegmentString)
|
36
|
+
max_weight_range_segments = max_weight_range_segments(range_segment_pairs)
|
28
37
|
|
29
|
-
|
30
|
-
|
38
|
+
segments = []
|
39
|
+
prev_text_index = 0
|
40
|
+
max_weight_range_segments.each do |(range, segment)|
|
41
|
+
prev_text = text[prev_text_index...range.begin]
|
42
|
+
|
43
|
+
segments << prev_text unless prev_text.empty?
|
44
|
+
|
45
|
+
segments << SegmentString.new(text[range.begin, range.size], segment)
|
46
|
+
|
47
|
+
prev_text_index = range.end
|
31
48
|
end
|
49
|
+
after_text = text[prev_text_index..-1]
|
50
|
+
segments << after_text unless after_text.empty?
|
32
51
|
segments
|
33
52
|
end
|
34
53
|
|
54
|
+
|
55
|
+
def max_weight_range_segments(range_segment_pairs)
|
56
|
+
edges = []
|
57
|
+
prev_nodes = Array.new(range_segment_pairs.size, -1)
|
58
|
+
weights = range_segment_pairs.map {|rsp| rsp[0].size }
|
59
|
+
|
60
|
+
range_segment_pairs.each_with_index do |rsp, rsp_i|
|
61
|
+
((rsp_i + 1)...range_segment_pairs.size).each do |rsp2_i|
|
62
|
+
rsp2 = range_segment_pairs[rsp2_i]
|
63
|
+
|
64
|
+
next if rsp2[0].begin < rsp[0].end
|
65
|
+
|
66
|
+
edges << [rsp_i, rsp2_i]
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
edges.each do |(rsp_i, rsp2_i)|
|
71
|
+
new_rsp2_weight = weights[rsp_i] + range_segment_pairs[rsp2_i][0].size
|
72
|
+
|
73
|
+
if weights[rsp2_i] < new_rsp2_weight
|
74
|
+
weights[rsp2_i] = new_rsp2_weight
|
75
|
+
prev_nodes[rsp2_i] = rsp_i
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
node_index = weights.index(weights.max)
|
80
|
+
|
81
|
+
max_weight_range_segment_indexes = Enumerator.new {|y|
|
82
|
+
loop do
|
83
|
+
break if node_index == -1
|
84
|
+
y << node_index
|
85
|
+
node_index = prev_nodes[node_index]
|
86
|
+
end
|
87
|
+
}.to_a.reverse
|
88
|
+
|
89
|
+
max_weight_range_segment_indexes.map {|i|
|
90
|
+
range_segment_pairs[i]
|
91
|
+
}
|
92
|
+
end
|
93
|
+
|
35
94
|
private
|
36
95
|
|
37
96
|
def default_options
|
@@ -49,73 +108,16 @@ module Konjak
|
|
49
108
|
@options[:max_segment_length]
|
50
109
|
end
|
51
110
|
|
52
|
-
def
|
53
|
-
|
54
|
-
while true
|
55
|
-
break if text.length < min_segment_length
|
56
|
-
|
57
|
-
break unless text =~ pat
|
58
|
-
|
59
|
-
head = $`
|
60
|
-
match = $&
|
61
|
-
tail = $'
|
62
|
-
|
63
|
-
texts << head unless head.empty?
|
64
|
-
|
65
|
-
texts << SegmentString.new(match, segment)
|
66
|
-
|
67
|
-
text = tail
|
68
|
-
end
|
69
|
-
texts << text
|
70
|
-
end
|
71
|
-
|
72
|
-
def translation_units(text)
|
73
|
-
tus = @tmx.body.translation_units
|
74
|
-
|
75
|
-
tus.select! {|tu|
|
111
|
+
def translation_units
|
112
|
+
@translation_units ||= @tmx.body.translation_units.select {|tu|
|
76
113
|
segment = tu.variant(@lang).segment
|
77
114
|
segment_length = segment.text.length
|
78
115
|
|
79
116
|
next false if segment_length < min_segment_length
|
80
117
|
next false if max_segment_length && max_segment_length < segment_length
|
81
118
|
|
82
|
-
|
119
|
+
true
|
83
120
|
}
|
84
|
-
|
85
|
-
simular_translation_units_map = {}
|
86
|
-
|
87
|
-
tus.sort_by! {|tu|
|
88
|
-
tu_segment = tu.variant(@lang).segment
|
89
|
-
segment_text = tu_segment.text
|
90
|
-
|
91
|
-
unless simular_translation_units_map[segment_text]
|
92
|
-
simular_translation_units = tus.select {|tu2|
|
93
|
-
tu2.variant(@lang).segment.text.include?(segment_text)
|
94
|
-
}.sort_by! {|tu2| tu2.variant(@lang).segment.text.size }
|
95
|
-
|
96
|
-
simular_translation_units.each do |tu2|
|
97
|
-
simular_translation_units_map[tu2.variant(@lang).segment.text] = simular_translation_units
|
98
|
-
end
|
99
|
-
end
|
100
|
-
|
101
|
-
rank = simular_translation_units_map[segment_text].index {|tu2|
|
102
|
-
tu2.variant(@lang).segment.text == segment_text
|
103
|
-
}
|
104
|
-
|
105
|
-
# GTTの場合
|
106
|
-
translation_timestamp = nil
|
107
|
-
if tm_entry = tu.at('entry_metadata').try(:at, 'tm_entry')
|
108
|
-
source_info = tm_entry.at('source_info')
|
109
|
-
if source_info.try(:at, 'source_lang').try(:text) == @lang && source_info.try(:at, 'source').try(:text) == segment_text
|
110
|
-
translation_timestamp = tm_entry.at('translation').try(:attr, 'translation_timestamp').to_i
|
111
|
-
end
|
112
|
-
end
|
113
|
-
translation_timestamp ||= 0
|
114
|
-
|
115
|
-
[-rank, -translation_timestamp, -segment_text.length]
|
116
|
-
}
|
117
|
-
|
118
|
-
tus
|
119
121
|
end
|
120
122
|
end
|
121
123
|
end
|
data/lib/konjak/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: konjak
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.19
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Seiei Higa
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-09-
|
11
|
+
date: 2015-09-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|