text_alignment 0.4 → 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 35f83f76cd3e9ca59604327710d2eac4684b8abca9dbd167b2b391d98ea561f9
4
- data.tar.gz: 77007a1bfcf72d0681ac9d21b35221e7d519a291383f3d0d8fa0574d631b65ea
3
+ metadata.gz: d56b44ee3cc3451b78ddb87134fefd6c043b4730f1170b07069a417bb162bad1
4
+ data.tar.gz: 67318d70bee5bb82127c98e79e0b45e5e337832a2b8d23469474215168579f20
5
5
  SHA512:
6
- metadata.gz: 2f936569e072e6693f279b5d3a349965b286192d8f694fffbbf41110f66afbd5d80f865a9232c2bef60ec94920e0a814142c8756f0ba3ff29a109cbbe1f7abec
7
- data.tar.gz: 955b85aa639f733361f354cc68ac3d8b14a17a14784f1e6ec24f240934ef0d160c6b8cf01232e1e4243c2eecffcaf1665999cc3e455bfb0e61c0dd905d2cb8b4
6
+ metadata.gz: 60ae2292556d15d860e0c7377be9c1fd5aec6741b26f562fee8c0c2a9a123d0b8c656b2f7f34b8f74f83ea9ac4878b53f17454e3b25d27d00cd98ecf8175a387
7
+ data.tar.gz: fe8fe545101cddb638c00ffb6f6785f8b2f5ddb109b4010739e399a2c50fe41164d791fe4a5599a76fca5553c494f78fd49630368d48e406a8211fc0516916f5
@@ -1,18 +1,15 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'text_alignment/constants'
2
3
  require 'string-similarity'
3
4
 
4
5
  module TextAlignment; end unless defined? TextAlignment
5
6
 
6
- TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
7
- TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
8
- TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
9
-
10
7
  class TextAlignment::AnchorFinder
11
8
 
12
9
  def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
13
10
  @size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
14
11
  @size_window = _size_window || TextAlignment::SIZE_WINDOW
15
- @sim_threshold = _size_window || TextAlignment::TEXT_SIMILARITY_TRESHOLD
12
+ @sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
16
13
 
17
14
  @reverse = (target_str.length < source_str.length)
18
15
 
@@ -1,13 +1,11 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'text_alignment/constants'
2
3
  require 'string-similarity'
3
4
 
4
5
  module TextAlignment; end unless defined? TextAlignment
5
6
 
6
7
  # approximate the location of str1 in str2
7
- TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
8
8
  TextAlignment::MIN_LENGTH_FOR_APPROXIMATION = 50 unless defined? TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
9
- TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
10
- TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
11
9
 
12
10
  class << TextAlignment
13
11
 
@@ -16,8 +14,8 @@ class << TextAlignment
16
14
  raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
17
15
  return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
18
16
 
19
- ngram1 = (0 .. str1.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str1[i, TextAlignment::SIGNATURE_NGRAM]}
20
- ngram2 = (0 .. str2.length - TextAlignment::SIGNATURE_NGRAM).collect{|i| str2[i, TextAlignment::SIGNATURE_NGRAM]}
17
+ ngram1 = (0 .. str1.length - TextAlignment::SIZE_NGRAM).collect{|i| str1[i, TextAlignment::SIZE_NGRAM]}
18
+ ngram2 = (0 .. str2.length - TextAlignment::SIZE_NGRAM).collect{|i| str2[i, TextAlignment::SIZE_NGRAM]}
21
19
  ngram_shared = ngram1 & ngram2
22
20
 
23
21
  # If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
@@ -45,7 +43,7 @@ class << TextAlignment
45
43
  text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
46
44
  cache["#{fit_begin}-#{fit_end}"] = text_similarity
47
45
 
48
- break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
46
+ break if text_similarity > TextAlignment::TEXT_SIMILARITY_THRESHOLD
49
47
  fit_begin, fit_end = nil, nil
50
48
  end
51
49
  return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
@@ -0,0 +1,7 @@
1
+ module TextAlignment; end unless defined? TextAlignment
2
+
3
+ TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
4
+ TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
5
+ TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
6
+ TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
7
+ TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.87 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
@@ -9,8 +9,6 @@ require 'text_alignment/mappings'
9
9
 
10
10
  module TextAlignment; end unless defined? TextAlignment
11
11
 
12
- TextAlignment::SIGNATURE_NGRAM = 5 unless defined? TextAlignment::SIGNATURE_NGRAM
13
-
14
12
  class TextAlignment::GLCSTextAlignment
15
13
  attr_reader :position_map_begin, :position_map_end
16
14
  attr_reader :common_elements, :mapped_elements
@@ -1,22 +1,22 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'text_alignment/constants'
2
3
  require 'text_alignment/anchor_finder'
3
4
  require 'text_alignment/mixed_alignment'
4
5
 
5
6
  module TextAlignment; end unless defined? TextAlignment
6
7
 
7
- TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
8
- TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
9
- TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
10
-
11
-
12
8
  class TextAlignment::TextAlignment
13
9
  attr_reader :block_alignments
14
10
  attr_reader :similarity
15
11
  attr_reader :lost_annotations
16
12
 
17
- def initialize(str1, str2, mappings = nil)
13
+ def initialize(str1, str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
18
14
  raise ArgumentError, "nil string" if str1.nil? || str2.nil?
19
15
 
16
+ size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
17
+ size_window = _size_window || TextAlignment::SIZE_WINDOW
18
+ sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
19
+
20
20
  mappings ||= TextAlignment::MAPPINGS
21
21
 
22
22
  # try exact match
@@ -26,7 +26,7 @@ class TextAlignment::TextAlignment
26
26
  return @block_alignments
27
27
  end
28
28
 
29
- anchor_finder = TextAlignment::AnchorFinder.new(str1, str2)
29
+ anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, size_ngram, size_window, sim_threshold)
30
30
 
31
31
  # To collect matched blocks
32
32
  mblocks = []
@@ -1,3 +1,3 @@
1
1
  class TextAlignment
2
- VERSION = '0.4'
2
+ VERSION = '0.4.2'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_alignment
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.4'
4
+ version: 0.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-09-29 00:00:00.000000000 Z
11
+ date: 2020-10-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-dictionary
@@ -77,6 +77,7 @@ files:
77
77
  - lib/text_alignment.rb
78
78
  - lib/text_alignment/anchor_finder.rb
79
79
  - lib/text_alignment/approximate_fit.rb
80
+ - lib/text_alignment/constants.rb
80
81
  - lib/text_alignment/find_divisions.rb
81
82
  - lib/text_alignment/glcs_alignment.rb
82
83
  - lib/text_alignment/glcs_alignment_fast.rb