text_alignment 0.12.2 → 0.12.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/text_alignment/char_mapping.rb +36 -21
- data/lib/text_alignment/text_alignment.rb +2 -2
- data/lib/text_alignment/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ef51eeed4c82f3ddc211a1474a35c8b61a621ed7acb6bd801a62885c2c342448
|
4
|
+
data.tar.gz: 15b1d020f78a96e152459324d921092c5bd39477f856417fd30e4283d22399ea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 473a18002d40bf3db81e0f42ffe3fbcc3ff5b1281964c2d5391a57b34ff07ff973ca86e5fe34dcbe2537410b46105e05a60772ffc27ed6667443062558201952
|
7
|
+
data.tar.gz: a8ddf9b9a2bd19b1e7303fcdf83fd862bbe8b601f2de88aa59c5d51b5a7184a0ee00fde86877390267ffd3cf8619c6a52bffabd9cb5b95be4ab34d20f81e1fab
|
data/lib/text_alignment/char_mapping.rb
CHANGED
@@ -78,6 +78,34 @@ TextAlignment::CHAR_MAPPING = [
|
|
78
78
|
['"', "''"]
|
79
79
|
]
|
80
80
|
|
81
|
+
# build a string of every Unicode whitespace codepoint \s covers:
|
82
|
+
ALL_WS = [
|
83
|
+
"\u0009", # CHARACTER TABULATION
|
84
|
+
"\u000A", # LINE FEED
|
85
|
+
"\u000B", # LINE TABULATION
|
86
|
+
"\u000C", # FORM FEED
|
87
|
+
"\u000D", # CARRIAGE RETURN
|
88
|
+
"\u0020", # SPACE
|
89
|
+
"\u0085", # NEXT LINE
|
90
|
+
"\u00A0", # NO-BREAK SPACE
|
91
|
+
"\u1680", # OGHAM SPACE MARK
|
92
|
+
"\u2000", # EN QUAD
|
93
|
+
"\u2001", # EM QUAD
|
94
|
+
"\u2002", # EN SPACE
|
95
|
+
"\u2003", # EM SPACE
|
96
|
+
"\u2004", # THREE-PER-EM SPACE
|
97
|
+
"\u2005", # FOUR-PER-EM SPACE
|
98
|
+
"\u2006", # SIX-PER-EM SPACE
|
99
|
+
"\u2007", # FIGURE SPACE
|
100
|
+
"\u2008", # PUNCTUATION SPACE
|
101
|
+
"\u2009", # THIN SPACE
|
102
|
+
"\u200A", # HAIR SPACE
|
103
|
+
"\u2028", # LINE SEPARATOR
|
104
|
+
"\u2029", # PARAGRAPH SEPARATOR
|
105
|
+
"\u202F", # NARROW NO-BREAK SPACE
|
106
|
+
"\u205F", # MEDIUM MATHEMATICAL SPACE
|
107
|
+
"\u3000", # IDEOGRAPHIC SPACE
|
108
|
+
].join
|
81
109
|
|
82
110
|
class TextAlignment::CharMapping
|
83
111
|
attr_reader :mapped_text, :index_enmap
|
@@ -101,11 +129,11 @@ class TextAlignment::CharMapping
|
|
101
129
|
end
|
102
130
|
|
103
131
|
def enmap_position(position)
|
104
|
-
@index_enmap[position]
|
132
|
+
@index_enmap[position]
|
105
133
|
end
|
106
134
|
|
107
135
|
def demap_position(position)
|
108
|
-
@index_demap[position]
|
136
|
+
@index_demap[position]
|
109
137
|
end
|
110
138
|
|
111
139
|
def enmap_denotations(denotations)
|
@@ -113,23 +141,6 @@ class TextAlignment::CharMapping
|
|
113
141
|
|
114
142
|
denotations.map do |d|
|
115
143
|
d.dup.merge(span:{begin:enmap_position(d[:span][:begin]), end:enmap_position(d[:span][:end])})
|
116
|
-
rescue ArgumentError => e
|
117
|
-
snippet_begin = d[:span][:begin] - 5
|
118
|
-
if snippet_begin < 0
|
119
|
-
snippet_begin = 0
|
120
|
-
end
|
121
|
-
snippet_end = d[:span][:end] + 5
|
122
|
-
if snippet_end > @text.length
|
123
|
-
snippet_end = @text.length
|
124
|
-
end
|
125
|
-
snippet = @text[snippet_begin ... d[:span][:begin]] + '[' + @text[d[:span][:begin] ... d[:span][:end]] + ']' + @text[d[:span][:end] ... snippet_end]
|
126
|
-
if snippet_begin > 0
|
127
|
-
snippet = '...' + snippet
|
128
|
-
end
|
129
|
-
if snippet_end < @text.length
|
130
|
-
snippet = snippet + '...'
|
131
|
-
end
|
132
|
-
raise ArgumentError, e.message + " (#{snippet})"
|
133
144
|
end
|
134
145
|
end
|
135
146
|
|
@@ -214,11 +225,15 @@ class TextAlignment::CharMapping
|
|
214
225
|
end
|
215
226
|
|
216
227
|
def squeeze_ws_1!(text)
|
217
|
-
text.gsub!(/\s{2,}/, ' ')
|
228
|
+
# Below should have (almost) the same semantics as text.gsub!(/\s{2,}/, ' ')
|
229
|
+
non_space_ws = ALL_WS.delete(" ")
|
230
|
+
text.tr!(non_space_ws, " ")
|
231
|
+
text.squeeze!(" ")
|
218
232
|
end
|
219
233
|
|
220
234
|
def squeeze_ws_0!(text)
|
221
|
-
text.gsub!(/\s+/, '')
|
235
|
+
# Below should have the same semantics as text.gsub!(/\s+/, '')
|
236
|
+
text.delete!(ALL_WS)
|
222
237
|
end
|
223
238
|
|
224
239
|
end
|
data/lib/text_alignment/text_alignment.rb
CHANGED
@@ -40,12 +40,12 @@ class TextAlignment::TextAlignment
|
|
40
40
|
end
|
41
41
|
|
42
42
|
@mapped_text = @text_mapping.mapped_text
|
43
|
-
denotations_mapped = @text_mapping.enmap_denotations(denotations)
|
44
43
|
|
45
44
|
## To generate the block_alignment of the input text against the reference text
|
46
45
|
@blocks = if r = whole_block_alignment(@mapped_text, @mapped_reference_text, @cultivation_map)
|
47
46
|
r
|
48
47
|
else
|
48
|
+
denotations_mapped = @text_mapping.enmap_denotations(denotations)
|
49
49
|
find_block_alignment(@mapped_text, @mapped_reference_text, denotations_mapped, @cultivation_map)
|
50
50
|
end
|
51
51
|
|
@@ -343,7 +343,7 @@ class TextAlignment::TextAlignment
|
|
343
343
|
|
344
344
|
## term-based alignment
|
345
345
|
tblocks = if denotations
|
346
|
-
denotations_in_scope = denotations.select{|d| d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
|
346
|
+
denotations_in_scope = denotations.select{|d| d[:span][:begin] && d[:span][:end] && d[:span][:begin] >= b1 && d[:span][:end] <= e1}.
|
347
347
|
sort{|d1, d2| d1[:span][:begin] <=> d2[:span][:begin] || d2[:span][:end] <=> d1[:span][:end] }.
|
348
348
|
map{|d| d.merge(lex:str1[d[:span][:begin] ... d[:span][:end]])}
|
349
349
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_alignment
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.12.2
|
4
|
+
version: 0.12.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2025-05-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ruby-dictionary
|
@@ -111,7 +111,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
111
111
|
- !ruby/object:Gem::Version
|
112
112
|
version: '0'
|
113
113
|
requirements: []
|
114
|
-
rubygems_version: 3.4.
|
114
|
+
rubygems_version: 3.4.10
|
115
115
|
signing_key:
|
116
116
|
specification_version: 4
|
117
117
|
summary: Ruby class for aligning two character strings
|