lexm 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/lexm/lemma.rb +80 -47
- data/lib/lexm/lemma_list.rb +46 -0
- data/lib/lexm/sublemma.rb +36 -1
- data/lib/lexm/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c74656fda529cd304f92c513c537445e2e492cab160fb2397905e3e00f9fe059
|
4
|
+
data.tar.gz: fdc730362c3dd42a991ba11fdd96ef5d3ce60cb5332e1d85d4c342409cadc364
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6404296f5a2d1ea6791550e6a87dd8a95e009bdb1bef2fb7aaabb7f6204330243ff1ce0275b004b07a3f2a53e4b07cb680b35c074a3a82cebcff1507581a2328
|
7
|
+
data.tar.gz: a7c7125edb88bfeba32d952a6eabbde18c167755b2f074e562bdd33fc30a9cd073cd4848bfc3ba232515e7791aed4f0c60692fe31442fdd3f9c299ba65e8c3fb
|
data/lib/lexm/lemma.rb
CHANGED
@@ -119,57 +119,75 @@ module LexM
|
|
119
119
|
# @param sublemmasPart [String] sublemmas part string
|
120
120
|
# @return [void]
|
121
121
|
def parseSublemmas(sublemmasPart)
|
122
|
-
#
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
elsif sublemmasPart =~ />(.+)/
|
130
|
-
redirect = LemmaRedirect.new($1.strip)
|
131
|
-
@sublemmas << Sublemma.new(nil, redirect)
|
132
|
-
end
|
133
|
-
else
|
134
|
-
# Split the sublemmas and process each one
|
135
|
-
sublemmas = sublemmasPart.split(',')
|
122
|
+
# We need a smarter way to split sublemmas that respects parentheses
|
123
|
+
# This helps us correctly handle cases like ">(sp,pp)wring,abc"
|
124
|
+
sublemmas = smart_split_sublemmas(sublemmasPart)
|
125
|
+
|
126
|
+
# Process each sublemma
|
127
|
+
sublemmas.each do |sublemma|
|
128
|
+
sublemma = sublemma.strip
|
136
129
|
|
137
|
-
#
|
138
|
-
|
139
|
-
sublemma
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
text = $1.strip
|
160
|
-
redirect = LemmaRedirect.new($2.strip)
|
161
|
-
@sublemmas << Sublemma.new(text, redirect)
|
162
|
-
else
|
163
|
-
@sublemmas << Sublemma.new(sublemma)
|
164
|
-
end
|
130
|
+
# Handle pure redirection sublemma (starts with >)
|
131
|
+
if sublemma.start_with?('>')
|
132
|
+
if sublemma =~ />\((.+?)\)(.+)/
|
133
|
+
redirect = LemmaRedirect.new($2.strip, $1.split(',').map(&:strip))
|
134
|
+
@sublemmas << Sublemma.new(nil, redirect, self)
|
135
|
+
elsif sublemma =~ />(.+)/
|
136
|
+
redirect = LemmaRedirect.new($1.strip)
|
137
|
+
@sublemmas << Sublemma.new(nil, redirect, self)
|
138
|
+
end
|
139
|
+
# Handle normal sublemma with possible redirection
|
140
|
+
elsif sublemma.include?('>')
|
141
|
+
# Check for a redirection with relation types
|
142
|
+
if sublemma =~ /(.+?)>\((.+?)\)(.+)/
|
143
|
+
# Format: word>(relation)target
|
144
|
+
text = $1.strip
|
145
|
+
redirect = LemmaRedirect.new($3.strip, $2.split(',').map(&:strip))
|
146
|
+
@sublemmas << Sublemma.new(text, redirect, self)
|
147
|
+
elsif sublemma =~ /(.+?)>(.+)/
|
148
|
+
# Simple redirection without relation type
|
149
|
+
text = $1.strip
|
150
|
+
redirect = LemmaRedirect.new($2.strip)
|
151
|
+
@sublemmas << Sublemma.new(text, redirect, self)
|
165
152
|
else
|
166
|
-
|
167
|
-
@sublemmas << Sublemma.new(sublemma)
|
153
|
+
@sublemmas << Sublemma.new(sublemma, nil, self)
|
168
154
|
end
|
155
|
+
else
|
156
|
+
# Simple sublemma
|
157
|
+
@sublemmas << Sublemma.new(sublemma, nil, self)
|
169
158
|
end
|
170
159
|
end
|
171
160
|
end
|
172
161
|
|
162
|
+
# Helper method to split sublemmas while respecting parentheses
|
163
|
+
# This ensures we don't split inside relation type lists like (sp,pp)
|
164
|
+
# @param text [String] text to split at commas outside of parentheses
|
165
|
+
# @return [Array<String>] resulting substrings
|
166
|
+
def smart_split_sublemmas(text)
|
167
|
+
result = []
|
168
|
+
current = ""
|
169
|
+
paren_level = 0
|
170
|
+
|
171
|
+
text.each_char do |c|
|
172
|
+
if c == ',' && paren_level == 0
|
173
|
+
# Only split on commas outside of parentheses
|
174
|
+
result << current unless current.empty?
|
175
|
+
current = ""
|
176
|
+
else
|
177
|
+
current << c
|
178
|
+
# Track parenthesis nesting level
|
179
|
+
if c == '('
|
180
|
+
paren_level += 1
|
181
|
+
elsif c == ')'
|
182
|
+
paren_level -= 1 if paren_level > 0
|
183
|
+
end
|
184
|
+
end
|
185
|
+
end
|
186
|
+
|
187
|
+
result << current unless current.empty?
|
188
|
+
result
|
189
|
+
end
|
190
|
+
|
173
191
|
# Parse annotations like sp:past,pp:participle or pl:oxen
|
174
192
|
# @param annotationsText [String] annotations string
|
175
193
|
# @return [void]
|
@@ -215,7 +233,7 @@ module LexM
|
|
215
233
|
if redirected?
|
216
234
|
raise "Cannot add sublemmas to a redirection lemma"
|
217
235
|
end
|
218
|
-
@sublemmas << Sublemma.new(text)
|
236
|
+
@sublemmas << Sublemma.new(text, nil, self)
|
219
237
|
self
|
220
238
|
end
|
221
239
|
|
@@ -227,7 +245,7 @@ module LexM
|
|
227
245
|
raise "Cannot add sublemmas to a redirection lemma"
|
228
246
|
end
|
229
247
|
texts.each do |text|
|
230
|
-
@sublemmas << Sublemma.new(text)
|
248
|
+
@sublemmas << Sublemma.new(text, nil, self)
|
231
249
|
end
|
232
250
|
self
|
233
251
|
end
|
@@ -241,7 +259,7 @@ module LexM
|
|
241
259
|
raise "Cannot add sublemmas to a redirection lemma"
|
242
260
|
end
|
243
261
|
redirect = LemmaRedirect.new(target, types)
|
244
|
-
@sublemmas << Sublemma.new(nil, redirect)
|
262
|
+
@sublemmas << Sublemma.new(nil, redirect, self)
|
245
263
|
self
|
246
264
|
end
|
247
265
|
|
@@ -257,6 +275,21 @@ module LexM
|
|
257
275
|
self
|
258
276
|
end
|
259
277
|
|
278
|
+
# Returns a hash mapping each sublemma to its shortcut
|
279
|
+
# @param placeholder [String] optional placeholder to use instead of "~" (default: "~")
|
280
|
+
# @return [Hash<String, String>] hash mapping full sublemma text to shortcut
|
281
|
+
def shortcuts(placeholder = "~")
|
282
|
+
return {} if @text.nil? || redirected? || @sublemmas.empty?
|
283
|
+
|
284
|
+
result = {}
|
285
|
+
@sublemmas.each do |sublemma|
|
286
|
+
# Skip redirections and get the shortcut for text sublemmas
|
287
|
+
next if sublemma.redirected? || sublemma.text.nil?
|
288
|
+
result[sublemma.text] = sublemma.shortcut(placeholder)
|
289
|
+
end
|
290
|
+
result
|
291
|
+
end
|
292
|
+
|
260
293
|
# Validate annotation key and value format
|
261
294
|
# Ensures keys and values follow the expected format
|
262
295
|
# @param key [String] annotation key to validate
|
data/lib/lexm/lemma_list.rb
CHANGED
@@ -609,6 +609,52 @@ module LexM
|
|
609
609
|
def [](index)
|
610
610
|
@lemmas[index]
|
611
611
|
end
|
612
|
+
|
613
|
+
# Sort the lemmas based on their headwords (non-destructive)
|
614
|
+
# @param block [Proc] optional custom sort proc
|
615
|
+
# @return [LemmaList] a new sorted lemma list
|
616
|
+
def sort(&block)
|
617
|
+
if block_given?
|
618
|
+
sorted_list = LemmaList.new
|
619
|
+
sorted_list.instance_variable_set(:@lemmas, @lemmas.sort(&block))
|
620
|
+
sorted_list
|
621
|
+
else
|
622
|
+
# Default sort by headword text
|
623
|
+
sorted_list = LemmaList.new
|
624
|
+
sorted_list.instance_variable_set(:@lemmas, @lemmas.sort_by { |lemma| lemma.text.to_s.downcase })
|
625
|
+
sorted_list
|
626
|
+
end
|
627
|
+
end
|
628
|
+
|
629
|
+
# Sort the lemmas based on their headwords (destructive)
|
630
|
+
# @param block [Proc] optional custom sort proc
|
631
|
+
# @return [LemmaList] self
|
632
|
+
def sort!(&block)
|
633
|
+
if block_given?
|
634
|
+
@lemmas.sort!(&block)
|
635
|
+
else
|
636
|
+
# Default sort by headword text
|
637
|
+
@lemmas.sort_by! { |lemma| lemma.text.to_s.downcase }
|
638
|
+
end
|
639
|
+
self
|
640
|
+
end
|
641
|
+
|
642
|
+
# Sort the lemmas using a custom key function (non-destructive)
|
643
|
+
# @param block [Proc] key function to extract sort keys from lemmas
|
644
|
+
# @return [LemmaList] a new sorted lemma list
|
645
|
+
def sort_by(&block)
|
646
|
+
sorted_list = LemmaList.new
|
647
|
+
sorted_list.instance_variable_set(:@lemmas, @lemmas.sort_by(&block))
|
648
|
+
sorted_list
|
649
|
+
end
|
650
|
+
|
651
|
+
# Sort the lemmas using a custom key function (destructive)
|
652
|
+
# @param block [Proc] key function to extract sort keys from lemmas
|
653
|
+
# @return [LemmaList] self
|
654
|
+
def sort_by!(&block)
|
655
|
+
@lemmas.sort_by!(&block)
|
656
|
+
self
|
657
|
+
end
|
612
658
|
|
613
659
|
# Save to a file
|
614
660
|
# @param filename [String] file to save to
|
data/lib/lexm/sublemma.rb
CHANGED
@@ -14,16 +14,20 @@ module LexM
|
|
14
14
|
attr_accessor :text, :redirect
|
15
15
|
# Source location information
|
16
16
|
attr_accessor :source_file, :source_line, :source_column
|
17
|
+
# Reference to parent lemma
|
18
|
+
attr_accessor :parent
|
17
19
|
|
18
20
|
# Initialize a new sublemma
|
19
21
|
# @param text [String, nil] the text of the sublemma (nil for pure redirections)
|
20
22
|
# @param redirect [LemmaRedirect, nil] redirection information (nil for normal sublemmas)
|
23
|
+
# @param parent [Lemma, nil] parent lemma (optional)
|
21
24
|
# @param source_file [String, nil] source file path
|
22
25
|
# @param source_line [Integer, nil] source line number
|
23
26
|
# @param source_column [Integer, nil] source column number
|
24
|
-
def initialize(text = nil, redirect = nil, source_file = nil, source_line = nil, source_column = nil)
|
27
|
+
def initialize(text = nil, redirect = nil, parent = nil, source_file = nil, source_line = nil, source_column = nil)
|
25
28
|
@text = text
|
26
29
|
@redirect = redirect
|
30
|
+
@parent = parent
|
27
31
|
@source_file = source_file
|
28
32
|
@source_line = source_line
|
29
33
|
@source_column = source_column
|
@@ -35,6 +39,37 @@ module LexM
|
|
35
39
|
@text.nil? && !@redirect.nil?
|
36
40
|
end
|
37
41
|
|
42
|
+
# Returns a shortened version of the sublemma text, replacing the lemma part with a placeholder
|
43
|
+
# For example, if the lemma is "work" and sublemma is "work out", this returns "~ out"
|
44
|
+
# @param placeholder [String] optional placeholder to use instead of "~" (default: "~")
|
45
|
+
# @return [String, nil] the shortened sublemma text or nil if this is a redirection sublemma or has no parent
|
46
|
+
def shortcut(placeholder = "~")
|
47
|
+
return nil if redirected? || @text.nil? || @parent.nil? || @parent.text.nil?
|
48
|
+
|
49
|
+
parent_text = @parent.text
|
50
|
+
|
51
|
+
# Check if the sublemma starts with the parent lemma
|
52
|
+
if @text.start_with?(parent_text)
|
53
|
+
# Replace the parent lemma with the placeholder
|
54
|
+
remainder = @text[parent_text.length..-1]
|
55
|
+
|
56
|
+
# If the remainder starts with a space, keep it
|
57
|
+
if remainder.start_with?(" ")
|
58
|
+
return "#{placeholder}#{remainder}"
|
59
|
+
elsif remainder.empty?
|
60
|
+
# For exact matches, just return the placeholder
|
61
|
+
return placeholder
|
62
|
+
else
|
63
|
+
# For cases where the lemma is a prefix but not a whole word
|
64
|
+
# (e.g., lemma "over", sublemma "overdo") - don't create a shortcut
|
65
|
+
return @text
|
66
|
+
end
|
67
|
+
else
|
68
|
+
# If the sublemma doesn't start with the parent lemma, return the full text
|
69
|
+
return @text
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
38
73
|
# Convert to string representation
|
39
74
|
# @return [String] the string representation of this sublemma
|
40
75
|
def to_s
|
data/lib/lexm/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lexm
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yanis Zafirópulos
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-03-
|
11
|
+
date: 2025-03-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|