text_alignment 0.4 → 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/text_alignment/anchor_finder.rb +2 -5
- data/lib/text_alignment/approximate_fit.rb +4 -6
- data/lib/text_alignment/constants.rb +7 -0
- data/lib/text_alignment/glcs_alignment_fast.rb +0 -2
- data/lib/text_alignment/text_alignment.rb +7 -7
- data/lib/text_alignment/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d56b44ee3cc3451b78ddb87134fefd6c043b4730f1170b07069a417bb162bad1
|
4
|
+
data.tar.gz: 67318d70bee5bb82127c98e79e0b45e5e337832a2b8d23469474215168579f20
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 60ae2292556d15d860e0c7377be9c1fd5aec6741b26f562fee8c0c2a9a123d0b8c656b2f7f34b8f74f83ea9ac4878b53f17454e3b25d27d00cd98ecf8175a387
|
7
|
+
data.tar.gz: fe8fe545101cddb638c00ffb6f6785f8b2f5ddb109b4010739e399a2c50fe41164d791fe4a5599a76fca5553c494f78fd49630368d48e406a8211fc0516916f5
|
@@ -1,18 +1,15 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'text_alignment/constants'
|
2
3
|
require 'string-similarity'
|
3
4
|
|
4
5
|
module TextAlignment; end unless defined? TextAlignment
|
5
6
|
|
6
|
-
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
7
|
-
TextAlignment::SIZE_WINDOW = 40 unless defined? TextAlignment::SIZE_WINDOW
|
8
|
-
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
9
|
-
|
10
7
|
class TextAlignment::AnchorFinder
|
11
8
|
|
12
9
|
def initialize(source_str, target_str, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
13
10
|
@size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
|
14
11
|
@size_window = _size_window || TextAlignment::SIZE_WINDOW
|
15
|
-
@sim_threshold =
|
12
|
+
@sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
16
13
|
|
17
14
|
@reverse = (target_str.length < source_str.length)
|
18
15
|
|
@@ -1,13 +1,11 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'text_alignment/constants'
|
2
3
|
require 'string-similarity'
|
3
4
|
|
4
5
|
module TextAlignment; end unless defined? TextAlignment
|
5
6
|
|
6
7
|
# approximate the location of str1 in str2
|
7
|
-
TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
|
8
8
|
TextAlignment::MIN_LENGTH_FOR_APPROXIMATION = 50 unless defined? TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
|
9
|
-
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
10
|
-
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.7 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
11
9
|
|
12
10
|
class << TextAlignment
|
13
11
|
|
@@ -16,8 +14,8 @@ class << TextAlignment
|
|
16
14
|
raise ArgumentError, 'nil string' if str1.nil? || str2.nil?
|
17
15
|
return 0, str2.length if str2.length < TextAlignment::MIN_LENGTH_FOR_APPROXIMATION
|
18
16
|
|
19
|
-
ngram1 = (0 .. str1.length - TextAlignment::
|
20
|
-
ngram2 = (0 .. str2.length - TextAlignment::
|
17
|
+
ngram1 = (0 .. str1.length - TextAlignment::SIZE_NGRAM).collect{|i| str1[i, TextAlignment::SIZE_NGRAM]}
|
18
|
+
ngram2 = (0 .. str2.length - TextAlignment::SIZE_NGRAM).collect{|i| str2[i, TextAlignment::SIZE_NGRAM]}
|
21
19
|
ngram_shared = ngram1 & ngram2
|
22
20
|
|
23
21
|
# If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
|
@@ -45,7 +43,7 @@ class << TextAlignment
|
|
45
43
|
text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
|
46
44
|
cache["#{fit_begin}-#{fit_end}"] = text_similarity
|
47
45
|
|
48
|
-
break if text_similarity > TextAlignment::
|
46
|
+
break if text_similarity > TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
49
47
|
fit_begin, fit_end = nil, nil
|
50
48
|
end
|
51
49
|
return fit_begin, fit_end if fit_begin && fit_end && fit_begin < fit_end
|
@@ -0,0 +1,7 @@
|
|
1
|
+
module TextAlignment; end unless defined? TextAlignment
|
2
|
+
|
3
|
+
TextAlignment::SIZE_NGRAM = 8 unless defined? TextAlignment::SIZE_NGRAM
|
4
|
+
TextAlignment::SIZE_WINDOW = 60 unless defined? TextAlignment::SIZE_WINDOW
|
5
|
+
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
6
|
+
TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
|
7
|
+
TextAlignment::TEXT_SIMILARITY_THRESHOLD = 0.87 unless defined? TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
@@ -9,8 +9,6 @@ require 'text_alignment/mappings'
|
|
9
9
|
|
10
10
|
module TextAlignment; end unless defined? TextAlignment
|
11
11
|
|
12
|
-
TextAlignment::SIGNATURE_NGRAM = 5 unless defined? TextAlignment::SIGNATURE_NGRAM
|
13
|
-
|
14
12
|
class TextAlignment::GLCSTextAlignment
|
15
13
|
attr_reader :position_map_begin, :position_map_end
|
16
14
|
attr_reader :common_elements, :mapped_elements
|
@@ -1,22 +1,22 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'text_alignment/constants'
|
2
3
|
require 'text_alignment/anchor_finder'
|
3
4
|
require 'text_alignment/mixed_alignment'
|
4
5
|
|
5
6
|
module TextAlignment; end unless defined? TextAlignment
|
6
7
|
|
7
|
-
TextAlignment::SIGNATURE_NGRAM = 7 unless defined? TextAlignment::SIGNATURE_NGRAM
|
8
|
-
TextAlignment::BUFFER_RATE = 0.1 unless defined? TextAlignment::BUFFER_RATE
|
9
|
-
TextAlignment::BUFFER_MIN = 20 unless defined? TextAlignment::BUFFER_MIN
|
10
|
-
|
11
|
-
|
12
8
|
class TextAlignment::TextAlignment
|
13
9
|
attr_reader :block_alignments
|
14
10
|
attr_reader :similarity
|
15
11
|
attr_reader :lost_annotations
|
16
12
|
|
17
|
-
def initialize(str1, str2,
|
13
|
+
def initialize(str1, str2, _size_ngram = nil, _size_window = nil, _text_similiarity_threshold = nil)
|
18
14
|
raise ArgumentError, "nil string" if str1.nil? || str2.nil?
|
19
15
|
|
16
|
+
size_ngram = _size_ngram || TextAlignment::SIZE_NGRAM
|
17
|
+
size_window = _size_window || TextAlignment::SIZE_WINDOW
|
18
|
+
sim_threshold = _text_similiarity_threshold || TextAlignment::TEXT_SIMILARITY_THRESHOLD
|
19
|
+
|
20
20
|
mappings ||= TextAlignment::MAPPINGS
|
21
21
|
|
22
22
|
# try exact match
|
@@ -26,7 +26,7 @@ class TextAlignment::TextAlignment
|
|
26
26
|
return @block_alignments
|
27
27
|
end
|
28
28
|
|
29
|
-
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2)
|
29
|
+
anchor_finder = TextAlignment::AnchorFinder.new(str1, str2, size_ngram, size_window, sim_threshold)
|
30
30
|
|
31
31
|
# To collect matched blocks
|
32
32
|
mblocks = []
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 0.4.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-10-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|
@@ -77,6 +77,7 @@ files:
|
|
77
77
|
- lib/text_alignment.rb
|
78
78
|
- lib/text_alignment/anchor_finder.rb
|
79
79
|
- lib/text_alignment/approximate_fit.rb
|
80
|
+
- lib/text_alignment/constants.rb
|
80
81
|
- lib/text_alignment/find_divisions.rb
|
81
82
|
- lib/text_alignment/glcs_alignment.rb
|
82
83
|
- lib/text_alignment/glcs_alignment_fast.rb
|