konjak 0.0.18 → 0.0.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c4ffc2d0548951d62d07f5612bd4399e8a8f6519
4
- data.tar.gz: 77a83feb0b806e2e6a87b56246c7062fa175f364
3
+ metadata.gz: 768f8768ce6cf9b74a7efdbc099fe192d8315263
4
+ data.tar.gz: 6ea6d582ac4df35043317bc274bf9391716fe5f3
5
5
  SHA512:
6
- metadata.gz: c92f4f5d4328b6fe29e5a92451b7e648796c37ec7c044a58ecfbb70e7620cf67494fce6a8052bbce6901caa85a529a5d35001ccafe236c7dd0b97efcd7e399d3
7
- data.tar.gz: 995318581a91f8fbc41ddf97258777ba3942e6e7c69ab1b303208be8d7f1aecca44c2b0561c1a10adc70f5055e583223f97ad8fcca7787a8165f85ff37f8a8e8
6
+ metadata.gz: 90a225fc3be21ca3a273ff7304c5c803d72ed372449a53143090909b48e4272ba8cf9f47bc0522dc502e50582ebcd0c76c3dffce4ef605c7322e3448c9a0ab48
7
+ data.tar.gz: cdf1f709324975a459b808e7c855efcfa981cee5f96839905ab3744cd52fa80a7247f199957617059b1f509211582aa32558c1a8e666b827ae303a503f29f6b2
@@ -23,8 +23,8 @@ module Konjak
23
23
 
24
24
  def compile_pattern
25
25
  regexp = Regexp.escape(text)
26
- regexp.gsub!(/(?<!^)\\\s/) { WHITE_SPACE_PATTERN_TEXT }
27
- regexp.gsub!(/(?<!^)(?:\\s)+(?!$)/) {|s| s + POSSESSIVE_QUALIFIER }
26
+ regexp.gsub!(/(?<!^)\\\s/) { WHITE_SPACE_PATTERN_TEXT }
27
+ regexp.gsub!(/(?<!^)(?:\\s)+/) {|s| s + POSSESSIVE_QUALIFIER }
28
28
  Regexp.compile(regexp)
29
29
  end
30
30
 
@@ -16,22 +16,81 @@ module Konjak
16
16
  end
17
17
 
18
18
  def segmentize(text)
19
- segments = [text]
20
- translation_units(text).each do |translation_unit|
21
- segment = translation_unit.variant(@lang).segment
19
+ range_segment_pairs = []
22
20
 
23
- pat = compile_pattern(segment)
21
+ translation_units.each {|tu|
22
+ segment = tu.variant(@lang).segment
23
+ text.scan(compile_pattern(segment)) {
24
+ range_segment_pairs << [($~.begin(0)...$~.end(0)), segment]
25
+ }
26
+ }
27
+
28
+ # Can't split text
29
+ return [text] if range_segment_pairs.empty?
30
+
31
+ range_segment_pairs.uniq! {|rsp| [rsp[0], rsp[1].text] }
32
+ range_segment_pairs.sort_by! {|(m, s)|
33
+ [m.begin, -s.text.size]
34
+ }
24
35
 
25
- segments.map! {|text|
26
- next text if text.length < min_segment_length
27
- next text if text.is_a?(SegmentString)
36
+ max_weight_range_segments = max_weight_range_segments(range_segment_pairs)
28
37
 
29
- split(pat, segment, text)
30
- }.flatten!
38
+ segments = []
39
+ prev_text_index = 0
40
+ max_weight_range_segments.each do |(range, segment)|
41
+ prev_text = text[prev_text_index...range.begin]
42
+
43
+ segments << prev_text unless prev_text.empty?
44
+
45
+ segments << SegmentString.new(text[range.begin, range.size], segment)
46
+
47
+ prev_text_index = range.end
31
48
  end
49
+ after_text = text[prev_text_index..-1]
50
+ segments << after_text unless after_text.empty?
32
51
  segments
33
52
  end
34
53
 
54
+
55
+ def max_weight_range_segments(range_segment_pairs)
56
+ edges = []
57
+ prev_nodes = Array.new(range_segment_pairs.size, -1)
58
+ weights = range_segment_pairs.map {|rsp| rsp[0].size }
59
+
60
+ range_segment_pairs.each_with_index do |rsp, rsp_i|
61
+ ((rsp_i + 1)...range_segment_pairs.size).each do |rsp2_i|
62
+ rsp2 = range_segment_pairs[rsp2_i]
63
+
64
+ next if rsp2[0].begin < rsp[0].end
65
+
66
+ edges << [rsp_i, rsp2_i]
67
+ end
68
+ end
69
+
70
+ edges.each do |(rsp_i, rsp2_i)|
71
+ new_rsp2_weight = weights[rsp_i] + range_segment_pairs[rsp2_i][0].size
72
+
73
+ if weights[rsp2_i] < new_rsp2_weight
74
+ weights[rsp2_i] = new_rsp2_weight
75
+ prev_nodes[rsp2_i] = rsp_i
76
+ end
77
+ end
78
+
79
+ node_index = weights.index(weights.max)
80
+
81
+ max_weight_range_segment_indexes = Enumerator.new {|y|
82
+ loop do
83
+ break if node_index == -1
84
+ y << node_index
85
+ node_index = prev_nodes[node_index]
86
+ end
87
+ }.to_a.reverse
88
+
89
+ max_weight_range_segment_indexes.map {|i|
90
+ range_segment_pairs[i]
91
+ }
92
+ end
93
+
35
94
  private
36
95
 
37
96
  def default_options
@@ -49,73 +108,16 @@ module Konjak
49
108
  @options[:max_segment_length]
50
109
  end
51
110
 
52
- def split(pat, segment, text)
53
- texts = []
54
- while true
55
- break if text.length < min_segment_length
56
-
57
- break unless text =~ pat
58
-
59
- head = $`
60
- match = $&
61
- tail = $'
62
-
63
- texts << head unless head.empty?
64
-
65
- texts << SegmentString.new(match, segment)
66
-
67
- text = tail
68
- end
69
- texts << text
70
- end
71
-
72
- def translation_units(text)
73
- tus = @tmx.body.translation_units
74
-
75
- tus.select! {|tu|
111
+ def translation_units
112
+ @translation_units ||= @tmx.body.translation_units.select {|tu|
76
113
  segment = tu.variant(@lang).segment
77
114
  segment_length = segment.text.length
78
115
 
79
116
  next false if segment_length < min_segment_length
80
117
  next false if max_segment_length && max_segment_length < segment_length
81
118
 
82
- text =~ compile_pattern(tu.variant(@lang).segment)
119
+ true
83
120
  }
84
-
85
- simular_translation_units_map = {}
86
-
87
- tus.sort_by! {|tu|
88
- tu_segment = tu.variant(@lang).segment
89
- segment_text = tu_segment.text
90
-
91
- unless simular_translation_units_map[segment_text]
92
- simular_translation_units = tus.select {|tu2|
93
- tu2.variant(@lang).segment.text.include?(segment_text)
94
- }.sort_by! {|tu2| tu2.variant(@lang).segment.text.size }
95
-
96
- simular_translation_units.each do |tu2|
97
- simular_translation_units_map[tu2.variant(@lang).segment.text] = simular_translation_units
98
- end
99
- end
100
-
101
- rank = simular_translation_units_map[segment_text].index {|tu2|
102
- tu2.variant(@lang).segment.text == segment_text
103
- }
104
-
105
- # GTTの場合
106
- translation_timestamp = nil
107
- if tm_entry = tu.at('entry_metadata').try(:at, 'tm_entry')
108
- source_info = tm_entry.at('source_info')
109
- if source_info.try(:at, 'source_lang').try(:text) == @lang && source_info.try(:at, 'source').try(:text) == segment_text
110
- translation_timestamp = tm_entry.at('translation').try(:attr, 'translation_timestamp').to_i
111
- end
112
- end
113
- translation_timestamp ||= 0
114
-
115
- [-rank, -translation_timestamp, -segment_text.length]
116
- }
117
-
118
- tus
119
121
  end
120
122
  end
121
123
  end
@@ -1,3 +1,3 @@
1
1
  module Konjak
2
- VERSION = "0.0.18"
2
+ VERSION = "0.0.19"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: konjak
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.18
4
+ version: 0.0.19
5
5
  platform: ruby
6
6
  authors:
7
7
  - Seiei Higa
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-09-18 00:00:00.000000000 Z
11
+ date: 2015-09-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport