text_alignment 0.2.6 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -1
- data/Gemfile.lock +3 -1
- data/bin/align_annotations +4 -0
- data/lib/text_alignment/approximate_fit.rb +31 -17
- data/lib/text_alignment/version.rb +1 -1
- data/text_alignment.gemspec +1 -0
- metadata +17 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 45bf8e55a66daff4d79765aec0e4d7482151aa7b86749e054e5d93030ffffabe
|
4
|
+
data.tar.gz: 6251835e35077e2ae828666026a663ae63be7f8d7b7d785b4898763021f546e2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4750b2507b4eb172123ca9a82320c2456e66d3f06ca3e3bdd52cf31055e9cb46af53013a484dbb87b1bb3236701cc8a69e1c4c64b576ee41a3a4957c3431be65
|
7
|
+
data.tar.gz: 1819b94e47bdf064d53712bb667f15d74f8e37bd49df7976075c3cbdc0a992d25bb318f76bfc0522cea0b594d002bcab0ebcef6112bf9e26128c245a7fb67bcf
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -15,6 +15,7 @@ GEM
|
|
15
15
|
rspec-support (~> 3.0.0)
|
16
16
|
rspec-support (3.0.4)
|
17
17
|
ruby-dictionary (1.1.1)
|
18
|
+
string-similarity (2.1.0)
|
18
19
|
|
19
20
|
PLATFORMS
|
20
21
|
ruby
|
@@ -23,9 +24,10 @@ DEPENDENCIES
|
|
23
24
|
diff-lcs (~> 1.3)
|
24
25
|
rspec (~> 3.0)
|
25
26
|
ruby-dictionary (~> 1.1, >= 1.1.1)
|
27
|
+
string-similarity (~> 2.1)
|
26
28
|
|
27
29
|
RUBY VERSION
|
28
|
-
ruby 2.
|
30
|
+
ruby 2.5.5p157
|
29
31
|
|
30
32
|
BUNDLED WITH
|
31
33
|
1.17.3
|
data/bin/align_annotations
CHANGED
@@ -42,6 +42,10 @@ puts
|
|
42
42
|
puts '[Denotations transformed]'
|
43
43
|
new_denotations = align.transform_hdenotations(denotations)
|
44
44
|
pp new_denotations
|
45
|
+
puts
|
46
|
+
puts '[Annotations transformed]'
|
47
|
+
anns2[:denotations] = new_denotations
|
48
|
+
puts anns2.to_json
|
45
49
|
|
46
50
|
# p align.common_elements
|
47
51
|
# puts "---------------"
|
@@ -1,11 +1,14 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'string-similarity'
|
3
|
+
|
2
4
|
module TextAlignment; end unless defined? TextAlignment
|
3
5
|
|
4
6
|
# approximate the location of str1 in str2
|
5
7
|
module TextAlignment
|
6
8
|
SIGNATURE_NGRAM = 5
|
7
9
|
MIN_LENGTH_FOR_APPROXIMATION = 50
|
8
|
-
BUFFER_RATE = 0.
|
10
|
+
BUFFER_RATE = 0.1
|
11
|
+
TEXT_SIMILARITY_TRESHOLD = 0.8
|
9
12
|
end
|
10
13
|
|
11
14
|
class << TextAlignment
|
@@ -22,29 +25,40 @@ class << TextAlignment
|
|
22
25
|
# If there is no shared n-gram found, it may mean there is no serious overlap between the two strings
|
23
26
|
return nil, nil if ngram_shared.empty?
|
24
27
|
|
25
|
-
|
26
|
-
|
28
|
+
signature_ngrams = ngram_shared.select{|g| ngram2.count(g) == 1}
|
29
|
+
return nil, nil if signature_ngrams.empty? #raise "no signature ngram"
|
30
|
+
|
31
|
+
fit_begin, fit_end = nil, nil
|
32
|
+
signature_ngrams.each do |signature_ngram|
|
33
|
+
loc_signature_ngram_in_str1 = str1.index(signature_ngram)
|
34
|
+
loc_signature_ngram_in_str2 = str2.index(signature_ngram)
|
27
35
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
fit_begin = 0 if fit_begin < 0
|
36
|
+
# approximate the beginning of the fit
|
37
|
+
fit_begin = loc_signature_ngram_in_str2 - loc_signature_ngram_in_str1 - (loc_signature_ngram_in_str1 * TextAlignment::BUFFER_RATE).to_i
|
38
|
+
fit_begin = 0 if fit_begin < 0
|
32
39
|
|
33
|
-
|
34
|
-
|
40
|
+
# approximate the end of the fit
|
41
|
+
offset_end = str1.length - loc_signature_ngram_in_str1
|
42
|
+
fit_end = loc_signature_ngram_in_str2 + offset_end + (offset_end * TextAlignment::BUFFER_RATE).to_i
|
43
|
+
fit_end = str2.length if fit_end > str2.length
|
35
44
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
return nil, nil if signature_ngram.nil? # raise "no signature ngram"
|
41
|
-
offset = str1.length - str1.rindex(signature_ngram)
|
42
|
-
fit_end = str2.rindex(signature_ngram) + offset + (offset * TextAlignment::BUFFER_RATE).to_i
|
43
|
-
fit_end = str2.length if fit_end > str2.length
|
45
|
+
text_similarity = text_similarity(str1, str2[fit_begin ... fit_end])
|
46
|
+
break if text_similarity > TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
47
|
+
fit_begin, fit_end = nil, nil
|
48
|
+
end
|
44
49
|
|
45
50
|
return nil, nil if fit_begin >= fit_end
|
46
51
|
return fit_begin, fit_end
|
47
52
|
end
|
53
|
+
|
54
|
+
private
|
55
|
+
|
56
|
+
def text_similarity(str1, str2, ngram_order = 3)
|
57
|
+
_str1 = str1.delete(" \t\r\n")
|
58
|
+
_str2 = str2.delete(" \t\r\n")
|
59
|
+
String::Similarity.cosine(_str1, _str2, ngram:2)
|
60
|
+
end
|
61
|
+
|
48
62
|
end
|
49
63
|
|
50
64
|
if __FILE__ == $0
|
data/text_alignment.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-07-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|
@@ -30,6 +30,20 @@ dependencies:
|
|
30
30
|
- - ">="
|
31
31
|
- !ruby/object:Gem::Version
|
32
32
|
version: 1.1.1
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: string-similarity
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - "~>"
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '2.1'
|
40
|
+
type: :development
|
41
|
+
prerelease: false
|
42
|
+
version_requirements: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - "~>"
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '2.1'
|
33
47
|
- !ruby/object:Gem::Dependency
|
34
48
|
name: rspec
|
35
49
|
requirement: !ruby/object:Gem::Requirement
|
@@ -98,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
98
112
|
- !ruby/object:Gem::Version
|
99
113
|
version: '0'
|
100
114
|
requirements: []
|
101
|
-
rubygems_version: 3.0.
|
115
|
+
rubygems_version: 3.0.8
|
102
116
|
signing_key:
|
103
117
|
specification_version: 4
|
104
118
|
summary: Ruby class for aligning two character strings
|