suma 0.1.15 → 0.1.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +4 -0
- data/lib/suma/cli/extract_terms.rb +102 -9
- data/lib/suma/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 955161f5bfcd18f46e1fcafafdc05bea5a7310eaa8899b03552e7d22f6f41795
|
4
|
+
data.tar.gz: ff06abeaf766ce76d814079ef718c6deb45e1dd0cae5d89e9d03d203e3a5635d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2568587bc47fe2eb892fc7e9f8f4981aceaa06eef4b27ad54cb427e9082cc23571a2c26f8ae9380dfcd52fbb40f38a77ad0d2282e7ada54d8744e3e21edcd301
|
7
|
+
data.tar.gz: 4b8505d33b3eef150aeb76f23548f5983740d1940488a0ee1c7d6e515cd249e5fa74fbdadb72543a91104ca9ce609eb0f374dd01c033db8eee2bb7585525b09f
|
data/.gitignore
CHANGED
@@ -12,6 +12,17 @@ module Suma
|
|
12
12
|
# ExtractTerms command using Expressir to extract terms into the
|
13
13
|
# Glossarist v2 format
|
14
14
|
class ExtractTerms < Thor
|
15
|
+
# Matches patterns like "A thing is a type of {{entity}}." or
|
16
|
+
# "An object is a type of a {{entity}}"
|
17
|
+
REDUNDANT_NOTE_REGEX =
|
18
|
+
%r{
|
19
|
+
^An? # Starts with "A" or "An"
|
20
|
+
\s.*?\sis\sa\stype\sof # Text followed by "is a type of"
|
21
|
+
(\sa|\san)? # Optional " a" or " an"
|
22
|
+
\s\{\{[^\}]*\}\} # Text in double curly braces
|
23
|
+
\s*?\.?$ # Optional whitespace and period at the end
|
24
|
+
}x
|
25
|
+
|
15
26
|
desc "extract_terms SCHEMA_MANIFEST_FILE GLOSSARIST_OUTPUT_PATH",
|
16
27
|
"Extract terms from SCHEMA_MANIFEST_FILE into " \
|
17
28
|
"Glossarist v2 format"
|
@@ -129,11 +140,12 @@ module Suma
|
|
129
140
|
data.sources = [source_ref] if source_ref
|
130
141
|
|
131
142
|
# Only assign optional fields if they have content
|
132
|
-
notes = get_entity_notes(entity, schema_domain)
|
143
|
+
notes = get_entity_notes(entity, schema_domain, data.definition)
|
133
144
|
data.notes = notes if notes && !notes.empty?
|
134
145
|
|
135
|
-
examples = get_entity_examples(entity, schema_domain)
|
136
|
-
data.examples = examples if examples && !examples.empty?
|
146
|
+
# examples = get_entity_examples(entity, schema_domain)
|
147
|
+
# data.examples = examples if examples && !examples.empty?
|
148
|
+
data.examples = []
|
137
149
|
end
|
138
150
|
|
139
151
|
Glossarist::LocalizedConcept.new.tap do |concept|
|
@@ -245,9 +257,20 @@ module Suma
|
|
245
257
|
[Glossarist::DetailedDefinition.new(content: definition)]
|
246
258
|
end
|
247
259
|
|
248
|
-
def get_entity_notes(entity, schema_domain)
|
260
|
+
# Collects the notes for an entity and runs them through the clean-up
# steps, finishing with the comparison against the definitions.
#
# Order of the steps: add_entity_notes, only_keep_first_sentence,
# remove_see_content, remove_redundant_note, remove_invalid_references,
# compare_with_definitions. add_other_notes is not part of this pipeline.
#
# Prints a progress line naming the entity before doing any work.
#
# @param entity the entity being processed; must respond to #id
# @param schema_domain passed through to add_entity_notes
# @param definitions passed to compare_with_definitions
# @return whatever compare_with_definitions returns for the cleaned notes
def get_entity_notes(entity, schema_domain, definitions)
  puts "Extracting notes for entity: #{entity.id}"

  collected = add_entity_notes(entity, schema_domain, [])
  first_sentences = only_keep_first_sentence(collected)
  without_see = remove_see_content(first_sentences)
  without_redundant = remove_redundant_note(without_see)
  valid_only = remove_invalid_references(without_redundant)

  compare_with_definitions(valid_only, definitions)
end
|
272
|
+
|
273
|
+
def add_entity_notes(entity, schema_domain, notes)
|
251
274
|
# Add trimmed definition from entity description as first note
|
252
275
|
if entity.remarks && !entity.remarks.empty?
|
253
276
|
trimmed_def = trim_definition(entity.remarks)
|
@@ -258,7 +281,11 @@ module Suma
|
|
258
281
|
end
|
259
282
|
end
|
260
283
|
|
261
|
-
|
284
|
+
notes.compact
|
285
|
+
end
|
286
|
+
|
287
|
+
def add_other_notes(entity, schema_domain, notes)
|
288
|
+
# Add other notes from entity remarks
|
262
289
|
other_notes = [
|
263
290
|
entity.remark_items&.select do |ri|
|
264
291
|
ri.id == "__note"
|
@@ -274,6 +301,67 @@ module Suma
|
|
274
301
|
notes
|
275
302
|
end
|
276
303
|
|
304
|
+
# https://github.com/metanorma/iso-10303/issues/621
# 1. First sentence in first paragraph of the entity description
#    (in EXPRESS remark) becomes NOTE 1 in ISO 10303-2 of the entity.
#
# Truncates the content of every note, in place, to its first sentence.
# A sentence ends at a period followed by a newline or by a space, so a
# period inside a token such as "abc.def" does not end the sentence. A
# trailing period is appended when the kept sentence does not have one.
#
# Notes that are nil, whose content is nil, or whose first sentence is
# empty (blank content, content starting with a period) are left untouched
# instead of raising NoMethodError.
#
# @param notes [Array] objects responding to #content and #content=
# @return [Array] the same array that was passed in
def only_keep_first_sentence(notes)
  notes.each do |note|
    content = note&.content
    next if content.nil?

    # `first` is nil when the split yields nothing, hence the `to_s`.
    sentence = content.split(".\n", 2).first.to_s.strip
    sentence = sentence.split(". ", 2).first.to_s.strip
    next if sentence.empty?

    note.content = sentence.end_with?(".") ? sentence : "#{sentence}."
  end
end
|
323
|
+
|
324
|
+
# https://github.com/metanorma/iso-10303/issues/621
# Discards the notes when the first note only repeats the first
# definition, i.e. when both have exactly the same content.
#
# A first note without content is never treated as a duplicate: a missing
# note content and a missing definition content are both nil, and nil must
# not count as "the note matches the definition".
#
# @param notes [Array, nil] objects responding to #content
# @param definitions [Array, nil] objects responding to #content
# @return [Array] an empty array when the first note duplicates the first
#   definition, otherwise the notes (an empty array when notes is nil)
def compare_with_definitions(notes, definitions)
  first_note = notes&.first&.content
  return Array(notes) if first_note.nil?

  # Discarding the notes as the first one matches the definition
  return [] if first_note == definitions&.first&.content

  notes
end
|
335
|
+
|
336
|
+
# https://github.com/metanorma/iso-10303/issues/621
# 3. No reference to any types or attribute or figures allowed in first
#    sentence. Entity references “{{…}}” are allowed.
#
# Returns the notes without those whose content embeds an image
# ("image::") or a cross-reference written as "<<...>>" on a single line.
# Notes that are nil or have no content carry no reference and are kept.
#
# @param notes [Array] objects responding to #content
# @return [Array] a new array; the argument is not modified
def remove_invalid_references(notes)
  notes.reject do |note|
    content = note&.content
    next false if content.nil?

    # A single lazy quantifier matches the same strings as the former
    # `(.*?){1,999}` without the nested-quantifier backtracking.
    content.include?("image::") || content.match?(/<<.*?>>/)
  end
end
|
345
|
+
|
346
|
+
# https://github.com/metanorma/iso-10303/issues/621
# 2. If this first sentence matches the 7-word magic sentence
#    (2-3 forms of that), it is discarded so there will not be a NOTE 1.
#
# Returns the notes without those whose content is a single line matching
# REDUNDANT_NOTE_REGEX. Content containing a newline is never considered
# redundant. Notes that are nil or have no content are kept.
#
# @param notes [Array] objects responding to #content
# @return [Array] a new array; the argument is not modified
def remove_redundant_note(notes)
  notes.reject do |note|
    content = note&.content
    next false if content.nil?

    !content.include?("\n") && content.match?(REDUNDANT_NOTE_REGEX)
  end
end
|
355
|
+
|
356
|
+
# https://github.com/metanorma/iso-10303/issues/621
# 5. If the sentence contains “\s+(see …)”, the contents including the
#    parentheses are removed.
#
# Rewrites the content of every note in place, deleting each
# "(see ...)" group together with the whitespace in front of it. Notes
# that are nil or have no content are skipped.
#
# @param notes [Array] objects responding to #content and #content=
# @return [Array] the same array that was passed in
def remove_see_content(notes)
  notes.each do |note|
    content = note&.content
    next if content.nil?

    # A single lazy quantifier stops at the first ")" just like the former
    # `(.*?){1,999}`, without the nested-quantifier backtracking.
    note.content = content.gsub(/\s+\(see.*?\)/, "")
  end
end
|
364
|
+
|
277
365
|
def get_entity_examples(entity, schema_domain)
|
278
366
|
examples = entity.remark_items&.select do |ri|
|
279
367
|
ri.id == "__example"
|
@@ -382,13 +470,18 @@ module Suma
|
|
382
470
|
end
|
383
471
|
# rubocop:enable Metrics/MethodLength
|
384
472
|
|
385
|
-
# Replace `<<express:{schema}.{entity}
|
473
|
+
# Replace `<<express:{schema}.{entity}>>` with {{entity}}
# and `<<express:{schema}.{entity},{render}>>` with {{entity,render}}
#
# Only the last dot-separated segment of the reference is kept as the
# entity name. The reference part may not contain ">" so that each
# `<<express:...>>` is matched on its own; otherwise two references in
# one string with no comma between them were merged into one mention.
#
# @param description [String] text possibly containing express references
# @return [String] a new string with every reference turned into a mention
def express_reference_to_mention(description)
  # TODO: Use Expressir to check whether the "entity" is really an
  # EXPRESS ENTITY. If not, skip the mention.
  description.gsub(/<<express:([^,>]+)(?:,([^>]+))?>>/) do
    entity = Regexp.last_match(1).split(".").last
    render = Regexp.last_match(2)

    render ? "{{#{entity},#{render}}}" : "{{#{entity}}}"
  end
end
|
393
486
|
|
394
487
|
def entity_name_to_text(entity_id)
|
data/lib/suma/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: suma
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.16
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-07-
|
11
|
+
date: 2025-07-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: expressir
|