text_alignment 0.3.22 → 0.4.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: aa961ae295a43e3912878dab34fac18a8cbca395eea999a781a538ce17c61bf3
4
- data.tar.gz: 38ae815d192104b9bff4664c5749910675ae4e5d9c895fa63c26fdc24f6d7b91
3
+ metadata.gz: 52bc948955e2df858e397b14eabb4411f73b3ff1e4d879ff4b7015d3b5e03308
4
+ data.tar.gz: fd20caec51c95bdc475e0698a52bb7fdebc9e22c43bb47267a883bcc75862268
5
5
  SHA512:
6
- metadata.gz: 185bdcd0298932a09795723085734d79aaddf638a45c871ff1d29e6c89e02041ddd12f848e3350244b27ac050adc7c52f7e1dedfe6664159d781c25d6dd55f1b
7
- data.tar.gz: 0b88cd8aff0f529b5cad32a360aba476e90d525bc77f89c5c5c59dcef5a6d9077c849acdf9f289f61fea2a50ebe927fd9a53c1eb0bdca9c7661900afdaa8fb94
6
+ metadata.gz: dbcb7ab70a64d4a398a5c5761cc5b2f5de6835ccc0e2d0854556f03ef91d0c0294986cc2ff1273788e6b7b0c73dfdf86fd16ee1ef8ce35ecc11d61f8eaab9521
7
+ data.tar.gz: 01d21cdcc0ab81d61e08ff1f52360ba35973756fd5060ce866391ff622d4cf87da945dba43a622a0581d63ee96c8723e1cb28991bfa02f4e1e803896bdc64d7f
@@ -1,17 +1,15 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'text_alignment/constants'
2
3
  require 'string-similarity'
3
4
 
4
5
  module TextAlignment; end unless defined? TextAlignment
5
6
 
6
- TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
7
- TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
8
- TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
9
-
10
7
  class TextAlignment::AnchorFinder
11
8
 
12
- def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil)
9
+ def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
13
10
  @size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
14
11
  @size_window = _size_window || TextAlignment::SIZE_WINDOW
12
+ @sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
15
13
 
16
14
  @reverse = (target_str.length < source_str.length)
17
15
 
@@ -43,10 +41,10 @@ class TextAlignment::AnchorFinder
43
41
  break if @beg_s1 > 0 && @beg_s2 > 0 && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
44
42
 
45
43
  left_window_s1, left_window_s2 = get_left_windows
46
- break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
44
+ break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > @sim_threshold)
47
45
 
48
46
  right_window_s1, right_window_s2 = get_right_windows
49
- break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
47
+ break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > @sim_threshold)
50
48
 
51
49
  search_position = @beg_s2 + 1
52
50
  end
@@ -87,7 +85,8 @@ class TextAlignment::AnchorFinder
87
85
  private
88
86
 
89
87
  def get_left_windows
90
- return if @beg_s1 < @size_window || @beg_s2 < @size_window
88
+ # commend below with the assumption that the beginning of a document gives a significant locational information
89
+ # return if @beg_s1 < @size_window || @beg_s2 < @size_window
91
90
 
92
91
  window_s1 = ''
93
92
  loc = @beg_s1 - 1
@@ -115,7 +114,8 @@ class TextAlignment::AnchorFinder
115
114
  end
116
115
 
117
116
  def get_right_windows
118
- return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
117
+ # commend below with the assumption that the end of a document gives a significant locational
118
+ # return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
119
119
 
120
120
  window_s1 = ''
121
121
  loc = @beg_s1 + @size_ngram
@@ -1,13 +1,11 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'text_alignment/constants'
2
3
  require 'string-similarity'
3
4
 
4
5
  module TextAlignment; end unless defined? TextAlignment
5
6
 
6
7
  # approximate the location of str1 in str2
7
- TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
8
8
  TextAlignment::MIN_LENGTH_FOR_APPROXIMATION = 50 unless defined? TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
9
- TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
10
- TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
11
9
 
12
10
  class << TextAlignment
13
11
 
@@ -16,8 +14,8 @@ class << TextAlignment
16
14
  raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
17
15
  return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
18
16
 
19
- ngram1 = (0 .. str1.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str1[i, TextAlignment::SIGNATURE_NGRAM]}
20
- ngram2 = (0 .. str2.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str2[i, TextAlignment::SIGNATURE_NGRAM]}
17
+ ngram1 = (0 .. str1.length - TextAlignment::SIZE_NGRAM).collect{|i| str1[i, TextAlignment::SIZE_NGRAM]}
18
+ ngram2 = (0 .. str2.length - TextAlignment::SIZE_NGRAM).collect{|i| str2[i, TextAlignment::SIZE_NGRAM]}
21
19
  ngram_shared = ngram1 & ngram2
22
20
 
23
21
  # If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
@@ -45,7 +43,7 @@ class << TextAlignment
45
43
  text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
46
44
  cache["#{fit_begin}-#{fit_end}"] = text_similarity
47
45
 
48
- break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
46
+ break if text_similarity > TextAlignment::TEXT_SIMILARITY_THRESHOLD
49
47
  fit_begin, fit_end = nil, nil
50
48
  end
51
49
  return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
@@ -0,0 +1,7 @@
1
+ module TextAlignment; end unless defined? TextAlignment
2
+
3
+ TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
4
+ TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
5
+ TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
6
+ TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
7
+ TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.9 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
@@ -9,8 +9,6 @@ require 'text_alignment/mappings'
9
9
 
10
10
  module TextAlignment; end unless defined? TextAlignment
11
11
 
12
- TextAlignment::SIGNATURE_NGRAM = 5 unless defined? TextAlignment::SIGNATURE_NGRAM
13
-
14
12
  class TextAlignment::GLCSTextAlignment
15
13
  attr_reader :position_map_begin, :position_map_end
16
14
  attr_reader :common_elements, :mapped_elements
@@ -10,8 +10,6 @@ require 'text_alignment/mappings'
10
10
 
11
11
  module TextAlignment; end unless defined? TextAlignment
12
12
 
13
- TextAlignment::NOMATCH_CHARS = "@^|#$%&_" unless defined? TextAlignment::NOMATCH_CHARS
14
-
15
13
  class TextAlignment::MixedAlignment
16
14
  attr_reader :sdiff
17
15
  attr_reader :position_map_begin, :position_map_end
@@ -21,58 +19,7 @@ class TextAlignment::MixedAlignment
21
19
 
22
20
  def initialize(str1, str2, mappings = [])
23
21
  raise ArgumentError, "nil string" if str1.nil? || str2.nil?
24
- raise ArgumentError, "nil mappings" if mappings.nil?
25
-
26
- ## preprocessing
27
- str1 = str1.dup
28
- str2 = str2.dup
29
- mappings = mappings.dup
30
-
31
- ## find the first nomatch character
32
- TextAlignment::NOMATCH_CHARS.each_char do |c|
33
- if str2.index(c).nil?
34
- @nomatch_char1 = c
35
- break
36
- end
37
- end
38
- raise RuntimeError, "Cannot find nomatch character" if @nomatch_char1.nil?
39
-
40
- ## find the first nomatch character
41
- TextAlignment::NOMATCH_CHARS.each_char do |c|
42
- if c != @nomatch_char1 && str1.index(c).nil?
43
- @nomatch_char2 = c
44
- break
45
- end
46
- end
47
- raise RuntimeError, "Cannot find nomatch character" if @nomatch_char2.nil?
48
-
49
- # single character mappings
50
- character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
51
- characters_from = character_mappings.collect{|m| m[0]}.join
52
- characters_to = character_mappings.collect{|m| m[1]}.join
53
- characters_to.gsub!(/-/, '\-')
54
-
55
- str1.tr!(characters_from, characters_to)
56
- str2.tr!(characters_from, characters_to)
57
-
58
- mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
59
-
60
- # ASCII foldings
61
- ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
62
- ascii_foldings.each do |f|
63
- from = f[1]
64
-
65
- if str2.index(f[0])
66
- to = f[0] + (@nomatch_char1 * (f[1].length - 1))
67
- str1.gsub!(from, to)
68
- end
69
-
70
- if str1.index(f[0])
71
- to = f[0] + (@nomatch_char2 * (f[1].length - 1))
72
- str2.gsub!(from, to)
73
- end
74
- end
75
- mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
22
+ mappings ||= []
76
23
 
77
24
  _compute_mixed_alignment(str1, str2, mappings)
78
25
  end
@@ -1,23 +1,21 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'text_alignment/constants'
2
3
  require 'text_alignment/anchor_finder'
3
4
  require 'text_alignment/mixed_alignment'
4
5
 
5
6
  module TextAlignment; end unless defined? TextAlignment
6
7
 
7
- TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
8
- TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
9
- TextAlignment::BUFFER_MIN = 10 unless defined? TextAlignment::BUFFER_MIN
10
-
8
+ TextAlignment::PADDING_LETTERS = ['@', '^', '|', '#', '$', '%', '&', '_'] unless defined? TextAlignment::PADDING_LETTERS
11
9
 
12
10
  class TextAlignment::TextAlignment
13
11
  attr_reader :block_alignments
14
12
  attr_reader :similarity
15
13
  attr_reader :lost_annotations
16
14
 
17
- def initialize(str1, str2, mappings = nil)
18
- raise ArgumentError, "nil string" if str1.nil? || str2.nil?
15
+ def initialize(_str1, _str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
16
+ raise ArgumentError, "nil string" if _str1.nil? || _str2.nil?
19
17
 
20
- mappings ||= TextAlignment::MAPPINGS
18
+ str1, str2, mappings = string_preprocessing(_str1, _str2)
21
19
 
22
20
  # try exact match
23
21
  block_begin = str2.index(str1)
@@ -26,7 +24,7 @@ class TextAlignment::TextAlignment
26
24
  return @block_alignments
27
25
  end
28
26
 
29
- anchor_finder = TextAlignment::AnchorFinder.new(str1, str2)
27
+ anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, _size_ngram, _size_window, _text_similiarity_threshold)
30
28
 
31
29
  # To collect matched blocks
32
30
  mblocks = []
@@ -232,4 +230,60 @@ class TextAlignment::TextAlignment
232
230
  r
233
231
  end
234
232
 
233
+ private
234
+
235
+
236
+ def string_preprocessing(_str1, _str2)
237
+ str1 = _str1.dup
238
+ str2 = _str2.dup
239
+ mappings = TextAlignment::MAPPINGS.dup
240
+
241
+ ## single character mappings
242
+ character_mappings = mappings.select{|m| m[0].length == 1 && m[1].length == 1}
243
+ characters_from = character_mappings.collect{|m| m[0]}.join
244
+ characters_to = character_mappings.collect{|m| m[1]}.join
245
+ characters_to.gsub!(/-/, '\-')
246
+
247
+ str1.tr!(characters_from, characters_to)
248
+ str2.tr!(characters_from, characters_to)
249
+
250
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length == 1}
251
+
252
+ ## long to one character mappings
253
+ pletters = TextAlignment::PADDING_LETTERS
254
+
255
+ # find the padding letter for str1
256
+ padding_letter1 = begin
257
+ i = pletters.index{|l| str2.index(l).nil?}
258
+ raise RuntimeError, "Could not find a padding letter for str1" if i.nil?
259
+ TextAlignment::PADDING_LETTERS[i]
260
+ end
261
+
262
+ # find the padding letter for str2
263
+ padding_letter2 = begin
264
+ i = pletters.index{|l| l != padding_letter1 && str1.index(l).nil?}
265
+ raise RuntimeError, "Could not find a padding letter for str2" if i.nil?
266
+ TextAlignment::PADDING_LETTERS[i]
267
+ end
268
+
269
+ # ASCII foldings
270
+ ascii_foldings = mappings.select{|m| m[0].length == 1 && m[1].length > 1}
271
+ ascii_foldings.each do |f|
272
+ from = f[1]
273
+
274
+ if str2.index(f[0])
275
+ to = f[0] + (padding_letter1 * (f[1].length - 1))
276
+ str1.gsub!(from, to)
277
+ end
278
+
279
+ if str1.index(f[0])
280
+ to = f[0] + (padding_letter2 * (f[1].length - 1))
281
+ str2.gsub!(from, to)
282
+ end
283
+ end
284
+ mappings.delete_if{|m| m[0].length == 1 && m[1].length > 1}
285
+
286
+ [str1, str2, mappings]
287
+ end
288
+
235
289
  end
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.3.22'
2
+ VERSION = '0.4.3'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.22
4
+ version: 0.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-09-28 00:00:00.000000000 Z
11
+ date: 2020-10-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary
@@ -77,6 +77,7 @@ files:
77
77
  - lib/text_alignment.rb
78
78
  - lib/text_alignment/anchor_finder.rb
79
79
  - lib/text_alignment/approximate_fit.rb
80
+ - lib/text_alignment/constants.rb
80
81
  - lib/text_alignment/find_divisions.rb
81
82
  - lib/text_alignment/glcs_alignment.rb
82
83
  - lib/text_alignment/glcs_alignment_fast.rb