text_alignment 0.3.11 → 0.3.13
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/align_annotations +3 -1
- data/lib/text_alignment/anchor_finder.rb +10 -7
- data/lib/text_alignment/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 19a2dfcf8dfffa752dfc0c3363d2d3e1cb3ef7498f79023cdd16e38aa8c46afd
|
4
|
+
data.tar.gz: 94d925dfc71d24b05fd6861a4f7f7344428b68785db84eeae8f430563b4e3318
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 72e61cf30c98df2c3d5ac19717c813c936b55daad22cb8c6e8b44bdb45321dab98c69d5f90820e9993d86263b04bdad2e96e8010afc8a57eee916126b673c8cc
|
7
|
+
data.tar.gz: d92c04294d58845f4a88cb8d9e3db42e9a18e0dd02d0398e3a95bf94662f64a33752754dd637163ef9bc4af77dc602c12fe683d49e9ac0ebb61a2469e5e08216
|
data/bin/align_annotations
CHANGED
@@ -103,6 +103,8 @@ target_annotations = if source_annotations.class == Array
|
|
103
103
|
else
|
104
104
|
alignment = TextAlignment::TextAlignment.new(source_annotations[:text], target_text)
|
105
105
|
|
106
|
+
pp alignment
|
107
|
+
|
106
108
|
# alignment.block_alignments.each do |a|
|
107
109
|
# if a[:alignment].nil? || a[:alignment] == :empty
|
108
110
|
# # p [a[:source], a[:target]]
|
@@ -159,7 +161,7 @@ if lost_annotations
|
|
159
161
|
warn "#{lost_annotations.length}"
|
160
162
|
end
|
161
163
|
|
162
|
-
puts target_annotations.to_json
|
164
|
+
#puts target_annotations.to_json
|
163
165
|
|
164
166
|
# denotations = anns1[:denotations]
|
165
167
|
|
data/lib/text_alignment/anchor_finder.rb
CHANGED
@@ -3,9 +3,9 @@ require 'string-similarity'
|
|
3
3
|
|
4
4
|
module TextAlignment; end unless defined? TextAlignment
|
5
5
|
|
6
|
-
TextAlignment::SIZE_NGRAM =
|
7
|
-
TextAlignment::SIZE_WINDOW =
|
8
|
-
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.
|
6
|
+
TextAlignment::SIZE_NGRAM = 10 unless defined? TextAlignment::SIZE_NGRAM
|
7
|
+
TextAlignment::SIZE_WINDOW = 20 unless defined? TextAlignment::SIZE_WINDOW
|
8
|
+
TextAlignment::TEXT_SIMILARITY_TRESHOLD = 0.8 unless defined? TextAlignment::TEXT_SIMILARITY_TRESHOLD
|
9
9
|
|
10
10
|
class TextAlignment::AnchorFinder
|
11
11
|
|
@@ -23,6 +23,7 @@ class TextAlignment::AnchorFinder
|
|
23
23
|
|
24
24
|
# current position in s1
|
25
25
|
@beg_s1 = 0
|
26
|
+
@end_s2_prev = 0
|
26
27
|
end
|
27
28
|
|
28
29
|
def get_next_anchor
|
@@ -31,15 +32,16 @@ class TextAlignment::AnchorFinder
|
|
31
32
|
anchor = @s1[@beg_s1, @size_ngram]
|
32
33
|
|
33
34
|
search_position = 0
|
35
|
+
# search_position = @end_s2_prev
|
34
36
|
while @beg_s2 = @s2.index(anchor, search_position)
|
35
37
|
# if both the beginning points are sufficiently close to the end points of the last match
|
36
|
-
break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 - @end_s2_prev < 5)
|
38
|
+
break if @end_s1_prev && (@beg_s1 - @end_s1_prev < 5) && (@beg_s2 >= @end_s2_prev) && (@beg_s2 - @end_s2_prev < 5)
|
37
39
|
|
38
40
|
left_window_s1, left_window_s2 = get_left_windows
|
39
|
-
break if left_window_s1
|
41
|
+
break if left_window_s1 && (text_similarity(left_window_s1, left_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
|
40
42
|
|
41
43
|
right_window_s1, right_window_s2 = get_right_windows
|
42
|
-
break if right_window_s2 && text_similarity(right_window_s1,
|
44
|
+
break if right_window_s2 && (text_similarity(right_window_s1, right_window_s2) > TextAlignment::TEXT_SIMILARITY_TRESHOLD)
|
43
45
|
|
44
46
|
search_position = @beg_s2 + 1
|
45
47
|
end
|
@@ -107,7 +109,7 @@ class TextAlignment::AnchorFinder
|
|
107
109
|
end
|
108
110
|
|
109
111
|
def get_right_windows
|
110
|
-
return if (@beg_s1 + @size_ngram
|
112
|
+
return if (@beg_s1 + @size_ngram > (@s1.length - @size_window)) || (@beg_s2 + @size_ngram > (@s2.length - @size_window))
|
111
113
|
|
112
114
|
window_s1 = ''
|
113
115
|
loc = @beg_s1 + @size_ngram
|
@@ -137,6 +139,7 @@ class TextAlignment::AnchorFinder
|
|
137
139
|
end
|
138
140
|
|
139
141
|
def text_similarity(str1, str2, ngram_order = 2)
|
142
|
+
return 0 if str1.nil? || str2.nil?
|
140
143
|
String::Similarity.cosine(str1, str2, ngram:ngram_order)
|
141
144
|
end
|
142
145
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.11
|
4
|
+
version: 0.3.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-09-
|
11
|
+
date: 2020-09-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|