konjak 0.0.18 → 0.0.19

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c4ffc2d0548951d62d07f5612bd4399e8a8f6519
4
- data.tar.gz: 77a83feb0b806e2e6a87b56246c7062fa175f364
3
+ metadata.gz: 768f8768ce6cf9b74a7efdbc099fe192d8315263
4
+ data.tar.gz: 6ea6d582ac4df35043317bc274bf9391716fe5f3
5
5
  SHA512:
6
- metadata.gz: c92f4f5d4328b6fe29e5a92451b7e648796c37ec7c044a58ecfbb70e7620cf67494fce6a8052bbce6901caa85a529a5d35001ccafe236c7dd0b97efcd7e399d3
7
- data.tar.gz: 995318581a91f8fbc41ddf97258777ba3942e6e7c69ab1b303208be8d7f1aecca44c2b0561c1a10adc70f5055e583223f97ad8fcca7787a8165f85ff37f8a8e8
6
+ metadata.gz: 90a225fc3be21ca3a273ff7304c5c803d72ed372449a53143090909b48e4272ba8cf9f47bc0522dc502e50582ebcd0c76c3dffce4ef605c7322e3448c9a0ab48
7
+ data.tar.gz: cdf1f709324975a459b808e7c855efcfa981cee5f96839905ab3744cd52fa80a7247f199957617059b1f509211582aa32558c1a8e666b827ae303a503f29f6b2
@@ -23,8 +23,8 @@ module Konjak
23
23
 
24
24
  def compile_pattern
25
25
  regexp = Regexp.escape(text)
26
- regexp.gsub!(/(?<!^)\\\s/) { WHITE_SPACE_PATTERN_TEXT }
27
- regexp.gsub!(/(?<!^)(?:\\s)+(?!$)/) {|s| s + POSSESSIVE_QUALIFIER }
26
+ regexp.gsub!(/(?<!^)\\\s/) { WHITE_SPACE_PATTERN_TEXT }
27
+ regexp.gsub!(/(?<!^)(?:\\s)+/) {|s| s + POSSESSIVE_QUALIFIER }
28
28
  Regexp.compile(regexp)
29
29
  end
30
30
 
@@ -16,22 +16,81 @@ module Konjak
16
16
  end
17
17
 
18
18
  def segmentize(text)
19
- segments = [text]
20
- translation_units(text).each do |translation_unit|
21
- segment = translation_unit.variant(@lang).segment
19
+ range_segment_pairs = []
22
20
 
23
- pat = compile_pattern(segment)
21
+ translation_units.each {|tu|
22
+ segment = tu.variant(@lang).segment
23
+ text.scan(compile_pattern(segment)) {
24
+ range_segment_pairs << [($~.begin(0)...$~.end(0)), segment]
25
+ }
26
+ }
27
+
28
+ # Can't split text
29
+ return [text] if range_segment_pairs.empty?
30
+
31
+ range_segment_pairs.uniq! {|rsp| [rsp[0], rsp[1].text] }
32
+ range_segment_pairs.sort_by! {|(m, s)|
33
+ [m.begin, -s.text.size]
34
+ }
24
35
 
25
- segments.map! {|text|
26
- next text if text.length < min_segment_length
27
- next text if text.is_a?(SegmentString)
36
+ max_weight_range_segments = max_weight_range_segments(range_segment_pairs)
28
37
 
29
- split(pat, segment, text)
30
- }.flatten!
38
+ segments = []
39
+ prev_text_index = 0
40
+ max_weight_range_segments.each do |(range, segment)|
41
+ prev_text = text[prev_text_index...range.begin]
42
+
43
+ segments << prev_text unless prev_text.empty?
44
+
45
+ segments << SegmentString.new(text[range.begin, range.size], segment)
46
+
47
+ prev_text_index = range.end
31
48
  end
49
+ after_text = text[prev_text_index..-1]
50
+ segments << after_text unless after_text.empty?
32
51
  segments
33
52
  end
34
53
 
54
+
55
+ def max_weight_range_segments(range_segment_pairs)
56
+ edges = []
57
+ prev_nodes = Array.new(range_segment_pairs.size, -1)
58
+ weights = range_segment_pairs.map {|rsp| rsp[0].size }
59
+
60
+ range_segment_pairs.each_with_index do |rsp, rsp_i|
61
+ ((rsp_i + 1)...range_segment_pairs.size).each do |rsp2_i|
62
+ rsp2 = range_segment_pairs[rsp2_i]
63
+
64
+ next if rsp2[0].begin < rsp[0].end
65
+
66
+ edges << [rsp_i, rsp2_i]
67
+ end
68
+ end
69
+
70
+ edges.each do |(rsp_i, rsp2_i)|
71
+ new_rsp2_weight = weights[rsp_i] + range_segment_pairs[rsp2_i][0].size
72
+
73
+ if weights[rsp2_i] < new_rsp2_weight
74
+ weights[rsp2_i] = new_rsp2_weight
75
+ prev_nodes[rsp2_i] = rsp_i
76
+ end
77
+ end
78
+
79
+ node_index = weights.index(weights.max)
80
+
81
+ max_weight_range_segment_indexes = Enumerator.new {|y|
82
+ loop do
83
+ break if node_index == -1
84
+ y << node_index
85
+ node_index = prev_nodes[node_index]
86
+ end
87
+ }.to_a.reverse
88
+
89
+ max_weight_range_segment_indexes.map {|i|
90
+ range_segment_pairs[i]
91
+ }
92
+ end
93
+
35
94
  private
36
95
 
37
96
  def default_options
@@ -49,73 +108,16 @@ module Konjak
49
108
  @options[:max_segment_length]
50
109
  end
51
110
 
52
- def split(pat, segment, text)
53
- texts = []
54
- while true
55
- break if text.length < min_segment_length
56
-
57
- break unless text =~ pat
58
-
59
- head = $`
60
- match = $&
61
- tail = $'
62
-
63
- texts << head unless head.empty?
64
-
65
- texts << SegmentString.new(match, segment)
66
-
67
- text = tail
68
- end
69
- texts << text
70
- end
71
-
72
- def translation_units(text)
73
- tus = @tmx.body.translation_units
74
-
75
- tus.select! {|tu|
111
+ def translation_units
112
+ @translation_units ||= @tmx.body.translation_units.select {|tu|
76
113
  segment = tu.variant(@lang).segment
77
114
  segment_length = segment.text.length
78
115
 
79
116
  next false if segment_length < min_segment_length
80
117
  next false if max_segment_length && max_segment_length < segment_length
81
118
 
82
- text =~ compile_pattern(tu.variant(@lang).segment)
119
+ true
83
120
  }
84
-
85
- simular_translation_units_map = {}
86
-
87
- tus.sort_by! {|tu|
88
- tu_segment = tu.variant(@lang).segment
89
- segment_text = tu_segment.text
90
-
91
- unless simular_translation_units_map[segment_text]
92
- simular_translation_units = tus.select {|tu2|
93
- tu2.variant(@lang).segment.text.include?(segment_text)
94
- }.sort_by! {|tu2| tu2.variant(@lang).segment.text.size }
95
-
96
- simular_translation_units.each do |tu2|
97
- simular_translation_units_map[tu2.variant(@lang).segment.text] = simular_translation_units
98
- end
99
- end
100
-
101
- rank = simular_translation_units_map[segment_text].index {|tu2|
102
- tu2.variant(@lang).segment.text == segment_text
103
- }
104
-
105
- # GTTの場合
106
- translation_timestamp = nil
107
- if tm_entry = tu.at('entry_metadata').try(:at, 'tm_entry')
108
- source_info = tm_entry.at('source_info')
109
- if source_info.try(:at, 'source_lang').try(:text) == @lang && source_info.try(:at, 'source').try(:text) == segment_text
110
- translation_timestamp = tm_entry.at('translation').try(:attr, 'translation_timestamp').to_i
111
- end
112
- end
113
- translation_timestamp ||= 0
114
-
115
- [-rank, -translation_timestamp, -segment_text.length]
116
- }
117
-
118
- tus
119
121
  end
120
122
  end
121
123
  end
@@ -1,3 +1,3 @@
1
1
  module Konjak
2
- VERSION = "0.0.18"
2
+ VERSION = "0.0.19"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: konjak
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.18
4
+ version: 0.0.19
5
5
  platform: ruby
6
6
  authors:
7
7
  - Seiei Higa
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-09-18 00:00:00.000000000 Z
11
+ date: 2015-09-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport