konjak 0.0.18 → 0.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/konjak/segment.rb +2 -2
- data/lib/konjak/tmx_segmentor/strategy.rb +71 -69
- data/lib/konjak/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 768f8768ce6cf9b74a7efdbc099fe192d8315263
|
4
|
+
data.tar.gz: 6ea6d582ac4df35043317bc274bf9391716fe5f3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 90a225fc3be21ca3a273ff7304c5c803d72ed372449a53143090909b48e4272ba8cf9f47bc0522dc502e50582ebcd0c76c3dffce4ef605c7322e3448c9a0ab48
|
7
|
+
data.tar.gz: cdf1f709324975a459b808e7c855efcfa981cee5f96839905ab3744cd52fa80a7247f199957617059b1f509211582aa32558c1a8e666b827ae303a503f29f6b2
|
data/lib/konjak/segment.rb
CHANGED
@@ -23,8 +23,8 @@ module Konjak
|
|
23
23
|
|
24
24
|
def compile_pattern
|
25
25
|
regexp = Regexp.escape(text)
|
26
|
-
regexp.gsub!(/(?<!^)\\\s/)
|
27
|
-
regexp.gsub!(/(?<!^)(?:\\s)
|
26
|
+
regexp.gsub!(/(?<!^)\\\s/) { WHITE_SPACE_PATTERN_TEXT }
|
27
|
+
regexp.gsub!(/(?<!^)(?:\\s)+/) {|s| s + POSSESSIVE_QUALIFIER }
|
28
28
|
Regexp.compile(regexp)
|
29
29
|
end
|
30
30
|
|
@@ -16,22 +16,81 @@ module Konjak
|
|
16
16
|
end
|
17
17
|
|
18
18
|
def segmentize(text)
|
19
|
-
|
20
|
-
translation_units(text).each do |translation_unit|
|
21
|
-
segment = translation_unit.variant(@lang).segment
|
19
|
+
range_segment_pairs = []
|
22
20
|
|
23
|
-
|
21
|
+
translation_units.each {|tu|
|
22
|
+
segment = tu.variant(@lang).segment
|
23
|
+
text.scan(compile_pattern(segment)) {
|
24
|
+
range_segment_pairs << [($~.begin(0)...$~.end(0)), segment]
|
25
|
+
}
|
26
|
+
}
|
27
|
+
|
28
|
+
# Can't split text
|
29
|
+
return [text] if range_segment_pairs.empty?
|
30
|
+
|
31
|
+
range_segment_pairs.uniq! {|rsp| [rsp[0], rsp[1].text] }
|
32
|
+
range_segment_pairs.sort_by! {|(m, s)|
|
33
|
+
[m.begin, -s.text.size]
|
34
|
+
}
|
24
35
|
|
25
|
-
|
26
|
-
next text if text.length < min_segment_length
|
27
|
-
next text if text.is_a?(SegmentString)
|
36
|
+
max_weight_range_segments = max_weight_range_segments(range_segment_pairs)
|
28
37
|
|
29
|
-
|
30
|
-
|
38
|
+
segments = []
|
39
|
+
prev_text_index = 0
|
40
|
+
max_weight_range_segments.each do |(range, segment)|
|
41
|
+
prev_text = text[prev_text_index...range.begin]
|
42
|
+
|
43
|
+
segments << prev_text unless prev_text.empty?
|
44
|
+
|
45
|
+
segments << SegmentString.new(text[range.begin, range.size], segment)
|
46
|
+
|
47
|
+
prev_text_index = range.end
|
31
48
|
end
|
49
|
+
after_text = text[prev_text_index..-1]
|
50
|
+
segments << after_text unless after_text.empty?
|
32
51
|
segments
|
33
52
|
end
|
34
53
|
|
54
|
+
|
55
|
+
def max_weight_range_segments(range_segment_pairs)
|
56
|
+
edges = []
|
57
|
+
prev_nodes = Array.new(range_segment_pairs.size, -1)
|
58
|
+
weights = range_segment_pairs.map {|rsp| rsp[0].size }
|
59
|
+
|
60
|
+
range_segment_pairs.each_with_index do |rsp, rsp_i|
|
61
|
+
((rsp_i + 1)...range_segment_pairs.size).each do |rsp2_i|
|
62
|
+
rsp2 = range_segment_pairs[rsp2_i]
|
63
|
+
|
64
|
+
next if rsp2[0].begin < rsp[0].end
|
65
|
+
|
66
|
+
edges << [rsp_i, rsp2_i]
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
edges.each do |(rsp_i, rsp2_i)|
|
71
|
+
new_rsp2_weight = weights[rsp_i] + range_segment_pairs[rsp2_i][0].size
|
72
|
+
|
73
|
+
if weights[rsp2_i] < new_rsp2_weight
|
74
|
+
weights[rsp2_i] = new_rsp2_weight
|
75
|
+
prev_nodes[rsp2_i] = rsp_i
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
node_index = weights.index(weights.max)
|
80
|
+
|
81
|
+
max_weight_range_segment_indexes = Enumerator.new {|y|
|
82
|
+
loop do
|
83
|
+
break if node_index == -1
|
84
|
+
y << node_index
|
85
|
+
node_index = prev_nodes[node_index]
|
86
|
+
end
|
87
|
+
}.to_a.reverse
|
88
|
+
|
89
|
+
max_weight_range_segment_indexes.map {|i|
|
90
|
+
range_segment_pairs[i]
|
91
|
+
}
|
92
|
+
end
|
93
|
+
|
35
94
|
private
|
36
95
|
|
37
96
|
def default_options
|
@@ -49,73 +108,16 @@ module Konjak
|
|
49
108
|
@options[:max_segment_length]
|
50
109
|
end
|
51
110
|
|
52
|
-
def
|
53
|
-
|
54
|
-
while true
|
55
|
-
break if text.length < min_segment_length
|
56
|
-
|
57
|
-
break unless text =~ pat
|
58
|
-
|
59
|
-
head = $`
|
60
|
-
match = $&
|
61
|
-
tail = $'
|
62
|
-
|
63
|
-
texts << head unless head.empty?
|
64
|
-
|
65
|
-
texts << SegmentString.new(match, segment)
|
66
|
-
|
67
|
-
text = tail
|
68
|
-
end
|
69
|
-
texts << text
|
70
|
-
end
|
71
|
-
|
72
|
-
def translation_units(text)
|
73
|
-
tus = @tmx.body.translation_units
|
74
|
-
|
75
|
-
tus.select! {|tu|
|
111
|
+
def translation_units
|
112
|
+
@translation_units ||= @tmx.body.translation_units.select {|tu|
|
76
113
|
segment = tu.variant(@lang).segment
|
77
114
|
segment_length = segment.text.length
|
78
115
|
|
79
116
|
next false if segment_length < min_segment_length
|
80
117
|
next false if max_segment_length && max_segment_length < segment_length
|
81
118
|
|
82
|
-
|
119
|
+
true
|
83
120
|
}
|
84
|
-
|
85
|
-
simular_translation_units_map = {}
|
86
|
-
|
87
|
-
tus.sort_by! {|tu|
|
88
|
-
tu_segment = tu.variant(@lang).segment
|
89
|
-
segment_text = tu_segment.text
|
90
|
-
|
91
|
-
unless simular_translation_units_map[segment_text]
|
92
|
-
simular_translation_units = tus.select {|tu2|
|
93
|
-
tu2.variant(@lang).segment.text.include?(segment_text)
|
94
|
-
}.sort_by! {|tu2| tu2.variant(@lang).segment.text.size }
|
95
|
-
|
96
|
-
simular_translation_units.each do |tu2|
|
97
|
-
simular_translation_units_map[tu2.variant(@lang).segment.text] = simular_translation_units
|
98
|
-
end
|
99
|
-
end
|
100
|
-
|
101
|
-
rank = simular_translation_units_map[segment_text].index {|tu2|
|
102
|
-
tu2.variant(@lang).segment.text == segment_text
|
103
|
-
}
|
104
|
-
|
105
|
-
# GTTの場合
|
106
|
-
translation_timestamp = nil
|
107
|
-
if tm_entry = tu.at('entry_metadata').try(:at, 'tm_entry')
|
108
|
-
source_info = tm_entry.at('source_info')
|
109
|
-
if source_info.try(:at, 'source_lang').try(:text) == @lang && source_info.try(:at, 'source').try(:text) == segment_text
|
110
|
-
translation_timestamp = tm_entry.at('translation').try(:attr, 'translation_timestamp').to_i
|
111
|
-
end
|
112
|
-
end
|
113
|
-
translation_timestamp ||= 0
|
114
|
-
|
115
|
-
[-rank, -translation_timestamp, -segment_text.length]
|
116
|
-
}
|
117
|
-
|
118
|
-
tus
|
119
121
|
end
|
120
122
|
end
|
121
123
|
end
|
data/lib/konjak/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: konjak
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.18
|
4
|
+
version: 0.0.19
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Seiei Higa
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-09-
|
11
|
+
date: 2015-09-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|