text_alignment 0.3.11 → 0.3.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/align_annotations +3 -1
- data/lib/text_alignment/anchor_finder.rb +10 -7
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 19a2dfcf8dfffa752dfc0c3363d2d3e1cb3ef7498f79023cdd16e38aa8c46afd
|
4
|
+
data.tar.gz: 94d925dfc71d24b05fd6861a4f7f7344428b68785db84eeae8f430563b4e3318
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 72e61cf30c98df2c3d5ac19717c813c936b55daad22cb8c6e8b44bdb45321dab98c69d5f90820e9993d86263b04bdad2e96e8010afc8a57eee916126b673c8cc
|
7
|
+
data.tar.gz: d92c04294d58845f4a88cb8d9e3db42e9a18e0dd02d0398e3a95bf94662f64a33752754dd637163ef9bc4af77dc602c12fe683d49e9ac0ebb61a2469e5e08216
|
data/bin/align_annotations
CHANGED
@@ -103,6 +103,8 @@ target_annotations = if source_annotations.class == Array
|
|
103
103
|
else
|
104
104
|
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
|
105
105
|
|
106
|
+
pp alignment
|
107
|
+
|
106
108
|
# alignment.block_alignments.each do |a|
|
107
109
|
# if a[:alignment].nil? || a[:alignment] == :empty
|
108
110
|
# # p [a[:source], a[:target]]
|
@@ -159,7 +161,7 @@ if lost_annotations
|
|
159
161
|
warn "#{lost_annotations.length}"
|
160
162
|
end
|
161
163
|
|
162
|
-
puts target_annotations.to_json
|
164
|
+
#puts target_annotations.to_json
|
163
165
|
|
164
166
|
# denotations = anns1[:denotations]
|
165
167
|
|
data/lib/text_alignment/anchor_finder.rb
CHANGED
@@ -3,9 +3,9 @@ require 'string-similarity'
|
|
3
3
|
|
4
4
|
module TextAlignment; end unless defined? TextAlignment
|
5
5
|
|
6
|
-
TextAlignment::SIZE_NGRAM =
|
7
|
-
TextAlignment::SIZE_WINDOW =
|
8
|
-
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.
|
6
|
+
TextAlignment::SIZE_NGRAM = 10 unless defined? TextAlignment::SIZE_NGRAM
|
7
|
+
TextAlignment::SIZE_WINDOW = 20 unless defined? TextAlignment::SIZE_WINDOW
|
8
|
+
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
9
9
|
|
10
10
|
class TextAlignment::AnchorFinder
|
11
11
|
|
@@ -23,6 +23,7 @@ class TextAlignment::AnchorFinder
|
|
23
23
|
|
24
24
|
# current position in s1
|
25
25
|
@beg_s1 = 0
|
26
|
+
@end_s2_prev = 0
|
26
27
|
end
|
27
28
|
|
28
29
|
def get_next_anchor
|
@@ -31,15 +32,16 @@ class TextAlignment::AnchorFinder
|
|
31
32
|
anchor = @s1[@beg_s1, @size_ngram]
|
32
33
|
|
33
34
|
search_position = 0
|
35
|
+
# search_position = @end_s2_prev
|
34
36
|
while @beg_s2 = @s2.index(anchor, search_position)
|
35
37
|
# if both the begining points are sufficiantly close to the end points of the last match
|
36
|
-
break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 - @end_s2_prev < 5)
|
38
|
+
break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
37
39
|
|
38
40
|
left_window_s1, left_window_s2 = get_left_windows
|
39
|
-
break if left_window_s1
|
41
|
+
break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
|
40
42
|
|
41
43
|
right_window_s1, right_window_s2 = get_right_windows
|
42
|
-
break if right_window_s2 && text_similarity(right_window_s1,
|
44
|
+
break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
|
43
45
|
|
44
46
|
search_position = @beg_s2 + 1
|
45
47
|
end
|
@@ -107,7 +109,7 @@ class TextAlignment::AnchorFinder
|
|
107
109
|
end
|
108
110
|
|
109
111
|
def get_right_windows
|
110
|
-
return if (@beg_s1 + @size_ngram
|
112
|
+
return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
|
111
113
|
|
112
114
|
window_s1 = ''
|
113
115
|
loc = @beg_s1 + @size_ngram
|
@@ -137,6 +139,7 @@ class TextAlignment::AnchorFinder
|
|
137
139
|
end
|
138
140
|
|
139
141
|
def text_similarity(str1, str2, ngram_order = 2)
|
142
|
+
return 0 if str1.nil? || str2.nil?
|
140
143
|
String::Similarity.cosine(str1, str2, ngram:ngram_order)
|
141
144
|
end
|
142
145
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-09-
|
11
|
+
date: 2020-09-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|