glossarist 2.6.2 → 2.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -1
- data/.rubocop_todo.yml +58 -16
- data/Gemfile +3 -19
- data/README.adoc +117 -0
- data/lib/glossarist/cli/import_command.rb +54 -0
- data/lib/glossarist/cli.rb +29 -8
- data/lib/glossarist/designation/expression.rb +1 -2
- data/lib/glossarist/designation/graphical_symbol.rb +1 -1
- data/lib/glossarist/managed_concept.rb +1 -1
- data/lib/glossarist/rdf/skos_concept.rb +0 -1
- data/lib/glossarist/rdf/skos_vocabulary.rb +0 -1
- data/lib/glossarist/sts/extracted_designation.rb +14 -0
- data/lib/glossarist/sts/extracted_lang_set.rb +16 -0
- data/lib/glossarist/sts/extracted_term.rb +13 -0
- data/lib/glossarist/sts/import_result.rb +24 -0
- data/lib/glossarist/sts/importer.rb +253 -0
- data/lib/glossarist/sts/term_extractor.rb +186 -0
- data/lib/glossarist/sts/term_mapper.rb +118 -0
- data/lib/glossarist/sts.rb +87 -0
- data/lib/glossarist/transforms/concept_to_skos_transform.rb +0 -2
- data/lib/glossarist/version.rb +1 -1
- data/lib/glossarist.rb +10 -7
- metadata +11 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1f3a8ec372c1c3e7a93ed7c2bad8ed2837f8f5bcd5ce4ae340bbb9f3b5ddaa75
|
|
4
|
+
data.tar.gz: e7c0672fc648ea748cff12bfc00a1ea62665aeaa20e4cf8a86dde1419a6094df
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 5a3654b99b5137104e26830fe77b1b6bad3eb2e0ce4ffa45d479b909399c469c41edbd7460ede72dc4f10bf94cc2f40649e78ce8510113ea9d97f0715750af15
|
|
7
|
+
data.tar.gz: eec5c75fd4a6a434999830038642ce387d74c2b7df976343d4b342aa919d1aaf7c4beccff4f92fbb18a0f0a4acf762885bf837edad7f16dd9889a93b33ed5613
|
data/.gitignore
CHANGED
data/.rubocop_todo.yml
CHANGED
|
@@ -1,59 +1,93 @@
|
|
|
1
1
|
# This configuration was generated by
|
|
2
2
|
# `rubocop --auto-gen-config`
|
|
3
|
-
# on 2026-05-
|
|
3
|
+
# on 2026-05-12 04:13:45 UTC using RuboCop version 1.86.1.
|
|
4
4
|
# The point is for the user to remove these configuration records
|
|
5
5
|
# one by one as the offenses are removed from the code base.
|
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
|
7
7
|
# versions of RuboCop, may require this file to be generated again.
|
|
8
8
|
|
|
9
|
+
# Offense count: 7
|
|
10
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
11
|
+
# Configuration parameters: TreatCommentsAsGroupSeparators, ConsiderPunctuation.
|
|
12
|
+
Bundler/OrderedGems:
|
|
13
|
+
Exclude:
|
|
14
|
+
- 'Gemfile'
|
|
15
|
+
|
|
9
16
|
# Offense count: 1
|
|
10
17
|
Gemspec/RequiredRubyVersion:
|
|
11
18
|
Exclude:
|
|
12
19
|
- 'glossarist.gemspec'
|
|
13
20
|
|
|
14
|
-
# Offense count:
|
|
21
|
+
# Offense count: 4
|
|
15
22
|
# This cop supports safe autocorrection (--autocorrect).
|
|
16
23
|
# Configuration parameters: EnforcedStyle, IndentationWidth.
|
|
17
24
|
# SupportedStyles: with_first_argument, with_fixed_indentation
|
|
18
25
|
Layout/ArgumentAlignment:
|
|
19
26
|
Exclude:
|
|
20
|
-
- '
|
|
27
|
+
- 'lib/glossarist/sts/import_result.rb'
|
|
28
|
+
- 'lib/glossarist/sts/importer.rb'
|
|
29
|
+
- 'lib/glossarist/sts/term_mapper.rb'
|
|
21
30
|
|
|
22
|
-
# Offense count:
|
|
31
|
+
# Offense count: 1
|
|
32
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
33
|
+
# Configuration parameters: IndentationWidth.
|
|
34
|
+
Layout/AssignmentIndentation:
|
|
35
|
+
Exclude:
|
|
36
|
+
- 'lib/glossarist/sts/term_mapper.rb'
|
|
37
|
+
|
|
38
|
+
# Offense count: 6
|
|
23
39
|
# This cop supports safe autocorrection (--autocorrect).
|
|
24
40
|
# Configuration parameters: EnforcedStyleAlignWith.
|
|
25
41
|
# SupportedStylesAlignWith: either, start_of_block, start_of_line
|
|
26
42
|
Layout/BlockAlignment:
|
|
27
43
|
Exclude:
|
|
28
|
-
- 'lib/glossarist/
|
|
44
|
+
- 'lib/glossarist/sts/term_extractor.rb'
|
|
45
|
+
- 'spec/unit/sts/term_extractor_spec.rb'
|
|
46
|
+
- 'spec/unit/sts/term_mapper_spec.rb'
|
|
29
47
|
|
|
30
|
-
# Offense count:
|
|
48
|
+
# Offense count: 6
|
|
31
49
|
# This cop supports safe autocorrection (--autocorrect).
|
|
32
50
|
Layout/BlockEndNewline:
|
|
33
51
|
Exclude:
|
|
34
|
-
- 'lib/glossarist/
|
|
52
|
+
- 'lib/glossarist/sts/term_extractor.rb'
|
|
53
|
+
- 'spec/unit/sts/term_extractor_spec.rb'
|
|
54
|
+
- 'spec/unit/sts/term_mapper_spec.rb'
|
|
35
55
|
|
|
36
|
-
# Offense count:
|
|
56
|
+
# Offense count: 1
|
|
57
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
58
|
+
# Configuration parameters: AllowMultipleStyles, EnforcedHashRocketStyle, EnforcedColonStyle, EnforcedLastArgumentHashStyle.
|
|
59
|
+
# SupportedHashRocketStyles: key, separator, table
|
|
60
|
+
# SupportedColonStyles: key, separator, table
|
|
61
|
+
# SupportedLastArgumentHashStyles: always_inspect, always_ignore, ignore_implicit, ignore_explicit
|
|
62
|
+
Layout/HashAlignment:
|
|
63
|
+
Exclude:
|
|
64
|
+
- 'lib/glossarist/sts/importer.rb'
|
|
65
|
+
|
|
66
|
+
# Offense count: 12
|
|
37
67
|
# This cop supports safe autocorrection (--autocorrect).
|
|
38
68
|
# Configuration parameters: Width, EnforcedStyleAlignWith, AllowedPatterns.
|
|
39
69
|
# SupportedStylesAlignWith: start_of_line, relative_to_receiver
|
|
40
70
|
Layout/IndentationWidth:
|
|
41
71
|
Exclude:
|
|
42
|
-
- 'lib/glossarist/
|
|
72
|
+
- 'lib/glossarist/sts/term_extractor.rb'
|
|
73
|
+
- 'spec/unit/sts/term_extractor_spec.rb'
|
|
74
|
+
- 'spec/unit/sts/term_mapper_spec.rb'
|
|
43
75
|
|
|
44
|
-
# Offense count:
|
|
76
|
+
# Offense count: 236
|
|
45
77
|
# This cop supports safe autocorrection (--autocorrect).
|
|
46
78
|
# Configuration parameters: Max, AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, AllowRBSInlineAnnotation, AllowCopDirectives, AllowedPatterns, SplitStrings.
|
|
47
79
|
# URISchemes: http, https
|
|
48
80
|
Layout/LineLength:
|
|
49
81
|
Enabled: false
|
|
50
82
|
|
|
51
|
-
# Offense count:
|
|
83
|
+
# Offense count: 7
|
|
52
84
|
# This cop supports safe autocorrection (--autocorrect).
|
|
53
85
|
# Configuration parameters: AllowInHeredoc.
|
|
54
86
|
Layout/TrailingWhitespace:
|
|
55
87
|
Exclude:
|
|
56
|
-
- '
|
|
88
|
+
- 'lib/glossarist/sts/import_result.rb'
|
|
89
|
+
- 'lib/glossarist/sts/importer.rb'
|
|
90
|
+
- 'lib/glossarist/sts/term_mapper.rb'
|
|
57
91
|
|
|
58
92
|
# Offense count: 1
|
|
59
93
|
# Configuration parameters: AllowedMethods.
|
|
@@ -106,12 +140,12 @@ Metrics/CyclomaticComplexity:
|
|
|
106
140
|
- 'lib/glossarist/transforms/concept_to_skos_transform.rb'
|
|
107
141
|
- 'lib/glossarist/transforms/concept_to_tbx_transform.rb'
|
|
108
142
|
|
|
109
|
-
# Offense count:
|
|
143
|
+
# Offense count: 47
|
|
110
144
|
# Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
|
|
111
145
|
Metrics/MethodLength:
|
|
112
146
|
Max: 42
|
|
113
147
|
|
|
114
|
-
# Offense count:
|
|
148
|
+
# Offense count: 4
|
|
115
149
|
# Configuration parameters: CountKeywordArgs, MaxOptionalParameters.
|
|
116
150
|
Metrics/ParameterLists:
|
|
117
151
|
Max: 6
|
|
@@ -145,7 +179,7 @@ Naming/VariableNumber:
|
|
|
145
179
|
Exclude:
|
|
146
180
|
- 'spec/unit/rdf/skos_vocabulary_spec.rb'
|
|
147
181
|
|
|
148
|
-
# Offense count:
|
|
182
|
+
# Offense count: 9
|
|
149
183
|
# This cop supports safe autocorrection (--autocorrect).
|
|
150
184
|
# Configuration parameters: EnforcedStyle, ProceduralMethods, FunctionalMethods, AllowedMethods, AllowedPatterns, AllowBracesOnProceduralOneLiners, BracesRequiredMethods.
|
|
151
185
|
# SupportedStyles: line_count_based, semantic, braces_for_chaining, always_braces
|
|
@@ -154,7 +188,9 @@ Naming/VariableNumber:
|
|
|
154
188
|
# AllowedMethods: lambda, proc, it
|
|
155
189
|
Style/BlockDelimiters:
|
|
156
190
|
Exclude:
|
|
157
|
-
- 'lib/glossarist/
|
|
191
|
+
- 'lib/glossarist/sts/term_extractor.rb'
|
|
192
|
+
- 'spec/unit/sts/term_extractor_spec.rb'
|
|
193
|
+
- 'spec/unit/sts/term_mapper_spec.rb'
|
|
158
194
|
|
|
159
195
|
# Offense count: 6
|
|
160
196
|
# This cop supports safe autocorrection (--autocorrect).
|
|
@@ -163,6 +199,12 @@ Style/BlockDelimiters:
|
|
|
163
199
|
Style/FormatStringToken:
|
|
164
200
|
EnforcedStyle: unannotated
|
|
165
201
|
|
|
202
|
+
# Offense count: 2
|
|
203
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
204
|
+
Style/MultilineIfModifier:
|
|
205
|
+
Exclude:
|
|
206
|
+
- 'lib/glossarist/sts/importer.rb'
|
|
207
|
+
|
|
166
208
|
# Offense count: 1
|
|
167
209
|
# Configuration parameters: AllowedClasses.
|
|
168
210
|
Style/OneClassPerFile:
|
data/Gemfile
CHANGED
|
@@ -6,29 +6,13 @@ gemspec
|
|
|
6
6
|
|
|
7
7
|
gem "canon"
|
|
8
8
|
gem "lutaml-model", "~> 0.8.0"
|
|
9
|
+
gem "nokogiri"
|
|
9
10
|
gem "rake", "~> 13.0"
|
|
11
|
+
gem "relaton", "~> 2.1.0"
|
|
10
12
|
gem "rspec", "~> 3.0"
|
|
11
13
|
gem "rubocop"
|
|
12
14
|
gem "rubocop-performance"
|
|
13
15
|
gem "rubocop-rake"
|
|
14
16
|
gem "rubocop-rspec"
|
|
17
|
+
gem "sts", "~> 0.5.6"
|
|
15
18
|
gem "tbx", "~> 0.1"
|
|
16
|
-
|
|
17
|
-
# Override relaton gems with lutaml-model 0.8 compatible versions.
|
|
18
|
-
# Released 2.0.0 gems have untyped lutaml-model attributes that fail with 0.8+.
|
|
19
|
-
# lutaml-integration branches have typed attributes and relaton-bib ~> 2.1.0.
|
|
20
|
-
# TODO: Remove once relaton gems release versions with lutaml-model 0.8 support.
|
|
21
|
-
gem "relaton-3gpp", github: "relaton/relaton-3gpp",
|
|
22
|
-
branch: "lutaml-integration"
|
|
23
|
-
gem "relaton-bib", github: "relaton/relaton-bib", branch: "lutaml-integration"
|
|
24
|
-
gem "relaton-bipm", github: "relaton/relaton-bipm",
|
|
25
|
-
branch: "lutaml-integration"
|
|
26
|
-
gem "relaton-bsi", github: "relaton/relaton-bsi", branch: "lutaml-integration"
|
|
27
|
-
gem "relaton-calconnect", github: "relaton/relaton-calconnect",
|
|
28
|
-
branch: "lutaml-integration"
|
|
29
|
-
gem "relaton-ccsds", github: "relaton/relaton-ccsds",
|
|
30
|
-
branch: "lutaml-integration"
|
|
31
|
-
gem "relaton-cen", github: "relaton/relaton-cen", branch: "lutaml-integration"
|
|
32
|
-
gem "relaton-iec", github: "relaton/relaton-iec", branch: "lutaml-integration"
|
|
33
|
-
gem "relaton-iso", github: "relaton/relaton-iso", branch: "lutaml-integration"
|
|
34
|
-
gem "relaton-itu", github: "relaton/relaton-itu", branch: "lutaml-integration"
|
data/README.adoc
CHANGED
|
@@ -507,6 +507,123 @@ puts skos.to_jsonld
|
|
|
507
507
|
puts skos.to_turtle
|
|
508
508
|
----
|
|
509
509
|
|
|
510
|
+
=== import
|
|
511
|
+
|
|
512
|
+
Import terminology concepts from STS XML files into a new or existing dataset.
|
|
513
|
+
|
|
514
|
+
[,bash]
|
|
515
|
+
----
|
|
516
|
+
# Import one or more STS XML files into a new dataset directory
|
|
517
|
+
glossarist import iso-8373.xml -o output_dir
|
|
518
|
+
|
|
519
|
+
# Import into a new GCR package (--shortname and --version required)
|
|
520
|
+
glossarist import iso-8373.xml -o iso-8373.gcr \
|
|
521
|
+
--shortname iso-8373 --version 1.0.0 --title "ISO 8373 Robotics"
|
|
522
|
+
|
|
523
|
+
# Import multiple files into a new dataset
|
|
524
|
+
glossarist import iso-8373.xml iso-9000.xml -o combined_dataset
|
|
525
|
+
|
|
526
|
+
# Import into an existing dataset (dedup by designation + domain)
|
|
527
|
+
glossarist import iso-8373.xml --into existing_dataset/
|
|
528
|
+
|
|
529
|
+
# Import into an existing GCR (re-packages automatically)
|
|
530
|
+
glossarist import iso-8373.xml --into existing.gcr
|
|
531
|
+
|
|
532
|
+
# Control duplicate handling
|
|
533
|
+
glossarist import iso-8373.xml --into existing_dataset/ --on-duplicate replace
|
|
534
|
+
----
|
|
535
|
+
|
|
536
|
+
Deduplication is based on **designation + domain** (case-insensitive). When
|
|
537
|
+
duplicates are found, the `--on-duplicate` strategy determines the behavior:
|
|
538
|
+
|
|
539
|
+
[cols="1,2"]
|
|
540
|
+
|===
|
|
541
|
+
|`skip` (default)
|
|
542
|
+
|Keep the existing concept, skip the new one
|
|
543
|
+
|
|
544
|
+
|`replace`
|
|
545
|
+
|Replace the existing concept with the new one
|
|
546
|
+
|
|
547
|
+
|`merge`
|
|
548
|
+
|Add new localizations to the existing concept (e.g. add French to an English-only concept)
|
|
549
|
+
|===
|
|
550
|
+
|
|
551
|
+
Options:
|
|
552
|
+
[cols="1,1"]
|
|
553
|
+
|===
|
|
554
|
+
|o, --output
|
|
555
|
+
|Output directory or `.gcr` file path (new dataset)
|
|
556
|
+
|
|
557
|
+
|--into
|
|
558
|
+
|Path to existing dataset directory or `.gcr` file to merge into
|
|
559
|
+
|
|
560
|
+
|--shortname
|
|
561
|
+
|Dataset shortname (required for GCR output)
|
|
562
|
+
|
|
563
|
+
|--version
|
|
564
|
+
|Dataset version (required for GCR output)
|
|
565
|
+
|
|
566
|
+
|--title
|
|
567
|
+
|Dataset title
|
|
568
|
+
|
|
569
|
+
|--description
|
|
570
|
+
|Dataset description
|
|
571
|
+
|
|
572
|
+
|--owner
|
|
573
|
+
|Dataset owner
|
|
574
|
+
|
|
575
|
+
|--uri-prefix
|
|
576
|
+
|URI prefix for the dataset
|
|
577
|
+
|
|
578
|
+
|--on-duplicate
|
|
579
|
+
|How to handle duplicates: `skip`, `replace`, or `merge`
|
|
580
|
+
|===
|
|
581
|
+
|
|
582
|
+
Ruby API:
|
|
583
|
+
[,ruby]
|
|
584
|
+
----
|
|
585
|
+
require "glossarist/sts"
|
|
586
|
+
|
|
587
|
+
importer = Glossarist::Sts::Importer.new
|
|
588
|
+
|
|
589
|
+
# Import into a new dataset directory
|
|
590
|
+
result = importer.import_new(
|
|
591
|
+
["iso-8373.xml", "iso-9000.xml"],
|
|
592
|
+
output: "output_dir",
|
|
593
|
+
)
|
|
594
|
+
puts result.concepts.length # total concepts imported
|
|
595
|
+
puts result.conflicts.length # duplicates detected
|
|
596
|
+
puts result.skipped_count # skipped (strategy: skip)
|
|
597
|
+
|
|
598
|
+
# Import into a new GCR package
|
|
599
|
+
result = importer.import_new(
|
|
600
|
+
["iso-8373.xml"],
|
|
601
|
+
output: "iso-8373.gcr",
|
|
602
|
+
shortname: "iso-8373",
|
|
603
|
+
version: "1.0.0",
|
|
604
|
+
title: "ISO 8373 Robotics Vocabulary",
|
|
605
|
+
)
|
|
606
|
+
|
|
607
|
+
# Import into an existing dataset with merge strategy
|
|
608
|
+
importer = Glossarist::Sts::Importer.new(duplicate_strategy: :merge)
|
|
609
|
+
result = importer.import_into_existing(
|
|
610
|
+
["french_supplement.xml"],
|
|
611
|
+
"existing_dataset/",
|
|
612
|
+
)
|
|
613
|
+
result.concepts.each do |mc|
|
|
614
|
+
puts "#{mc.data.id}: #{mc.localizations.keys.join(', ')}"
|
|
615
|
+
end
|
|
616
|
+
----
|
|
617
|
+
|
|
618
|
+
==== Import result
|
|
619
|
+
|
|
620
|
+
`import_new` and `import_into_existing` return an `ImportResult` with:
|
|
621
|
+
|
|
622
|
+
concepts:: `Array<ManagedConcept>` — the imported concepts
|
|
623
|
+
conflicts:: `Array<DuplicateConflict>` — duplicate pairs detected by designation + domain
|
|
624
|
+
source_files:: `Array<String>` — the input file paths
|
|
625
|
+
skipped_count:: `Integer` — concepts skipped due to duplicates (strategy: skip)
|
|
626
|
+
|
|
510
627
|
=== validate
|
|
511
628
|
|
|
512
629
|
Validate a dataset directory or `.gcr` file for schema compliance.
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Glossarist
|
|
4
|
+
class CLI
|
|
5
|
+
class ImportCommand
|
|
6
|
+
def initialize(files, options)
|
|
7
|
+
@files = files
|
|
8
|
+
@options = options
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def run
|
|
12
|
+
importer = Sts::Importer.new(
|
|
13
|
+
duplicate_strategy: @options[:on_duplicate]&.to_sym || :skip,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
result = if @options[:into]
|
|
17
|
+
importer.import_into_existing(@files, @options[:into])
|
|
18
|
+
else
|
|
19
|
+
importer.import_new(@files, **import_new_args)
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
print_summary(result)
|
|
23
|
+
rescue ArgumentError => e
|
|
24
|
+
warn "Error: #{e.message}"
|
|
25
|
+
exit 1
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
private
|
|
29
|
+
|
|
30
|
+
def import_new_args
|
|
31
|
+
{
|
|
32
|
+
output: @options[:output],
|
|
33
|
+
shortname: @options[:shortname],
|
|
34
|
+
version: @options[:version],
|
|
35
|
+
title: @options[:title],
|
|
36
|
+
description: @options[:description],
|
|
37
|
+
owner: @options[:owner],
|
|
38
|
+
uri_prefix: @options[:uri_prefix],
|
|
39
|
+
}
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def print_summary(result) # rubocop:disable Metrics/AbcSize
|
|
43
|
+
dest = @options[:into] || @options[:output]
|
|
44
|
+
puts "Imported #{result.concepts.length} concepts to #{dest}"
|
|
45
|
+
puts " Source files: #{@files.join(', ')}" if @files.any?
|
|
46
|
+
return unless result.conflict?
|
|
47
|
+
|
|
48
|
+
puts " #{result.conflicts.length} duplicate(s) detected " \
|
|
49
|
+
"(strategy: #{@options[:on_duplicate] || 'skip'})"
|
|
50
|
+
puts " #{result.skipped_count} concept(s) skipped" if result.skipped_count.positive?
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
end
|
|
54
|
+
end
|
data/lib/glossarist/cli.rb
CHANGED
|
@@ -4,6 +4,11 @@ require "thor"
|
|
|
4
4
|
|
|
5
5
|
module Glossarist
|
|
6
6
|
class CLI < Thor
|
|
7
|
+
autoload :UpgradeCommand, "#{__dir__}/cli/upgrade_command"
|
|
8
|
+
autoload :PackageCommand, "#{__dir__}/cli/package_command"
|
|
9
|
+
autoload :ValidateCommand, "#{__dir__}/cli/validate_command"
|
|
10
|
+
autoload :ImportCommand, "#{__dir__}/cli/import_command"
|
|
11
|
+
autoload :ExportCommand, "#{__dir__}/cli/export_command"
|
|
7
12
|
desc "generate_latex", "Convert Concepts to Latex format"
|
|
8
13
|
|
|
9
14
|
option :concepts_path, aliases: :p, required: true,
|
|
@@ -38,8 +43,7 @@ module Glossarist
|
|
|
38
43
|
option :dry_run, type: :boolean, default: false,
|
|
39
44
|
desc: "Show what would change without writing"
|
|
40
45
|
def upgrade(source_dir)
|
|
41
|
-
|
|
42
|
-
Glossarist::CLI::UpgradeCommand.new(source_dir, options).run
|
|
46
|
+
CLI::UpgradeCommand.new(source_dir, options).run
|
|
43
47
|
end
|
|
44
48
|
|
|
45
49
|
desc "package DIR", "Create a .gcr ZIP archive from a schema v1 dataset"
|
|
@@ -62,8 +66,7 @@ module Glossarist
|
|
|
62
66
|
option :concept_uri_template, type: :string,
|
|
63
67
|
desc: "URI template for concept URIs"
|
|
64
68
|
def package(dir)
|
|
65
|
-
|
|
66
|
-
Glossarist::CLI::PackageCommand.new(dir, options).run
|
|
69
|
+
CLI::PackageCommand.new(dir, options).run
|
|
67
70
|
end
|
|
68
71
|
|
|
69
72
|
desc "validate PATH",
|
|
@@ -76,8 +79,27 @@ module Glossarist
|
|
|
76
79
|
option :reference_path, type: :string,
|
|
77
80
|
desc: "Path to directory of .gcr files for cross-dataset reference validation"
|
|
78
81
|
def validate(path)
|
|
79
|
-
|
|
80
|
-
|
|
82
|
+
CLI::ValidateCommand.new(path, options).run
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
desc "import FILES...", "Import terms from STS XML files"
|
|
86
|
+
option :output, aliases: :o, type: :string,
|
|
87
|
+
desc: "Output directory or .gcr file path (new dataset)"
|
|
88
|
+
option :into, type: :string,
|
|
89
|
+
desc: "Path to existing dataset directory or .gcr file to merge into"
|
|
90
|
+
option :shortname, type: :string,
|
|
91
|
+
desc: "Dataset shortname (required for GCR output)"
|
|
92
|
+
option :version, type: :string,
|
|
93
|
+
desc: "Dataset version (required for GCR output)"
|
|
94
|
+
option :title, type: :string, desc: "Dataset title"
|
|
95
|
+
option :description, type: :string, desc: "Dataset description"
|
|
96
|
+
option :owner, type: :string, desc: "Dataset owner"
|
|
97
|
+
option :uri_prefix, type: :string, desc: "URI prefix for the dataset"
|
|
98
|
+
option :on_duplicate, type: :string, default: "skip",
|
|
99
|
+
enum: %w[skip replace merge],
|
|
100
|
+
desc: "How to handle duplicate concepts (designation + domain)"
|
|
101
|
+
def import(*files)
|
|
102
|
+
CLI::ImportCommand.new(files, options).run
|
|
81
103
|
end
|
|
82
104
|
|
|
83
105
|
desc "export PATH", "Export concepts in machine-readable formats"
|
|
@@ -95,8 +117,7 @@ module Glossarist
|
|
|
95
117
|
option :title, type: :string,
|
|
96
118
|
desc: "Dataset title for document header"
|
|
97
119
|
def export(path)
|
|
98
|
-
|
|
99
|
-
Glossarist::CLI::ExportCommand.new(path, options).run
|
|
120
|
+
CLI::ExportCommand.new(path, options).run
|
|
100
121
|
end
|
|
101
122
|
|
|
102
123
|
def method_missing(*args)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Glossarist
|
|
4
|
+
module Sts
|
|
5
|
+
ExtractedLangSet = Struct.new(
|
|
6
|
+
:language_code,
|
|
7
|
+
:definition_text,
|
|
8
|
+
:note_texts,
|
|
9
|
+
:example_texts,
|
|
10
|
+
:source_texts,
|
|
11
|
+
:domain,
|
|
12
|
+
:designations,
|
|
13
|
+
keyword_init: true,
|
|
14
|
+
)
|
|
15
|
+
end
|
|
16
|
+
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Glossarist
|
|
4
|
+
module Sts
|
|
5
|
+
DuplicateConflict = Struct.new(:new_concept, :existing_concept, :key,
|
|
6
|
+
keyword_init: true)
|
|
7
|
+
|
|
8
|
+
class ImportResult
|
|
9
|
+
attr_reader :concepts, :conflicts, :source_files, :skipped_count
|
|
10
|
+
|
|
11
|
+
def initialize(concepts:, conflicts: [], source_files: [],
|
|
12
|
+
skipped_count: 0)
|
|
13
|
+
@concepts = concepts
|
|
14
|
+
@conflicts = conflicts
|
|
15
|
+
@source_files = source_files
|
|
16
|
+
@skipped_count = skipped_count
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
def conflict?
|
|
20
|
+
!conflicts.empty?
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "tmpdir"
|
|
4
|
+
require_relative "import_result"
|
|
5
|
+
|
|
6
|
+
module Glossarist
|
|
7
|
+
module Sts
|
|
8
|
+
class Importer
|
|
9
|
+
STRATEGIES = %i[skip replace merge].freeze
|
|
10
|
+
|
|
11
|
+
attr_reader :duplicate_strategy
|
|
12
|
+
|
|
13
|
+
def initialize(duplicate_strategy: :skip)
|
|
14
|
+
unless STRATEGIES.include?(duplicate_strategy)
|
|
15
|
+
raise ArgumentError,
|
|
16
|
+
"duplicate_strategy must be one of #{STRATEGIES.join(', ')}, got #{duplicate_strategy}"
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
@duplicate_strategy = duplicate_strategy
|
|
20
|
+
@mapper = TermMapper.new
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def import_new(xml_files, output:, shortname: nil, version: nil, **opts)
|
|
24
|
+
raw_concepts = extract_all_concepts(xml_files)
|
|
25
|
+
concepts, conflicts, skipped = dedup_concepts(raw_concepts)
|
|
26
|
+
|
|
27
|
+
if output.end_with?(".gcr")
|
|
28
|
+
unless shortname
|
|
29
|
+
raise ArgumentError,
|
|
30
|
+
"--shortname is required for GCR output"
|
|
31
|
+
end
|
|
32
|
+
unless version
|
|
33
|
+
raise ArgumentError,
|
|
34
|
+
"--version is required for GCR output"
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
create_gcr(concepts, output, shortname: shortname, version: version,
|
|
38
|
+
**opts)
|
|
39
|
+
else
|
|
40
|
+
save_dataset(concepts, output)
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
ImportResult.new(
|
|
44
|
+
concepts: concepts,
|
|
45
|
+
conflicts: conflicts,
|
|
46
|
+
source_files: xml_files,
|
|
47
|
+
skipped_count: skipped,
|
|
48
|
+
)
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
def import_into_existing(xml_files, dataset_path)
|
|
52
|
+
existing = load_existing(dataset_path)
|
|
53
|
+
new_concepts = extract_all_concepts(xml_files)
|
|
54
|
+
index = build_concept_index(existing)
|
|
55
|
+
|
|
56
|
+
result_state = apply_with_dedup(new_concepts, existing, index)
|
|
57
|
+
|
|
58
|
+
save_to_path(existing, dataset_path)
|
|
59
|
+
|
|
60
|
+
ImportResult.new(
|
|
61
|
+
concepts: existing.managed_concepts,
|
|
62
|
+
conflicts: result_state.conflicts,
|
|
63
|
+
source_files: xml_files,
|
|
64
|
+
skipped_count: result_state.skipped,
|
|
65
|
+
)
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
DedupState = Struct.new(:conflicts, :skipped, keyword_init: true)
|
|
69
|
+
|
|
70
|
+
private
|
|
71
|
+
|
|
72
|
+
def apply_with_dedup(new_concepts, existing, index)
|
|
73
|
+
state = DedupState.new(conflicts: [], skipped: 0)
|
|
74
|
+
|
|
75
|
+
new_concepts.each do |mc|
|
|
76
|
+
key = concept_key(mc)
|
|
77
|
+
existing_mc = index[key]
|
|
78
|
+
|
|
79
|
+
if existing_mc.nil?
|
|
80
|
+
existing.store(mc)
|
|
81
|
+
index[key] = mc
|
|
82
|
+
else
|
|
83
|
+
state.conflicts << DuplicateConflict.new(
|
|
84
|
+
new_concept: mc, existing_concept: existing_mc, key: key,
|
|
85
|
+
)
|
|
86
|
+
handle_duplicate(existing, existing_mc, mc, index, key, state)
|
|
87
|
+
end
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
state
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
def handle_duplicate(existing, old_mc, new_mc, index, key, state)
|
|
94
|
+
case duplicate_strategy
|
|
95
|
+
when :skip
|
|
96
|
+
state.skipped += 1
|
|
97
|
+
when :replace
|
|
98
|
+
replace_in_collection(existing, old_mc, new_mc)
|
|
99
|
+
index[key] = new_mc
|
|
100
|
+
when :merge
|
|
101
|
+
merge_concept(old_mc, new_mc)
|
|
102
|
+
end
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
def extract_all_concepts(xml_files)
|
|
106
|
+
xml_files.flat_map do |path|
|
|
107
|
+
extractor = TermExtractor.new(path)
|
|
108
|
+
terms = extractor.extract
|
|
109
|
+
terms.map { |t| @mapper.map(t) }
|
|
110
|
+
end
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
def dedup_concepts(concepts) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity
|
|
114
|
+
seen = {}
|
|
115
|
+
conflicts = []
|
|
116
|
+
skipped = 0
|
|
117
|
+
unique = []
|
|
118
|
+
|
|
119
|
+
concepts.each do |mc|
|
|
120
|
+
key = concept_key(mc)
|
|
121
|
+
if key.first.empty? || seen[key].nil?
|
|
122
|
+
unique << mc
|
|
123
|
+
seen[key] = mc unless key.first.empty?
|
|
124
|
+
else
|
|
125
|
+
conflicts << DuplicateConflict.new(
|
|
126
|
+
new_concept: mc, existing_concept: seen[key], key: key,
|
|
127
|
+
)
|
|
128
|
+
skipped += apply_dedup_to_unique(unique, seen, mc, key)
|
|
129
|
+
end
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
[unique, conflicts, skipped]
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
def apply_dedup_to_unique(unique, seen, new_mc, key)
|
|
136
|
+
case duplicate_strategy
|
|
137
|
+
when :skip
|
|
138
|
+
1
|
|
139
|
+
when :replace
|
|
140
|
+
unique.delete(seen[key])
|
|
141
|
+
unique << new_mc
|
|
142
|
+
seen[key] = new_mc
|
|
143
|
+
0
|
|
144
|
+
when :merge
|
|
145
|
+
merge_concept(seen[key], new_mc)
|
|
146
|
+
0
|
|
147
|
+
end
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
def concept_key(managed_concept)
|
|
151
|
+
designation = managed_concept.default_designation.to_s.downcase.strip
|
|
152
|
+
domain = begin
|
|
153
|
+
l10n = managed_concept.default_lang
|
|
154
|
+
l10n&.data&.domain.to_s.downcase.strip
|
|
155
|
+
end
|
|
156
|
+
[designation, domain]
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
def build_concept_index(collection)
|
|
160
|
+
index = {}
|
|
161
|
+
collection.each do |mc|
|
|
162
|
+
key = concept_key(mc)
|
|
163
|
+
index[key] = mc unless key.first.empty?
|
|
164
|
+
end
|
|
165
|
+
index
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
def merge_concept(existing_mc, new_mc)
|
|
169
|
+
new_mc.localizations.each do |l10n|
|
|
170
|
+
lang = l10n.language_code
|
|
171
|
+
if existing_mc.localization(lang).nil?
|
|
172
|
+
existing_mc.add_localization(l10n)
|
|
173
|
+
end
|
|
174
|
+
end
|
|
175
|
+
end
|
|
176
|
+
|
|
177
|
+
def replace_in_collection(collection, old_mc, new_mc)
|
|
178
|
+
collection.managed_concepts.delete(old_mc)
|
|
179
|
+
collection.store(new_mc)
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
def load_existing(path)
|
|
183
|
+
collection = ManagedConceptCollection.new
|
|
184
|
+
if path.end_with?(".gcr")
|
|
185
|
+
package = GcrPackage.load(path)
|
|
186
|
+
package.concepts.each { |mc| collection.store(mc) }
|
|
187
|
+
else
|
|
188
|
+
concepts = ConceptCollector.collect(path)
|
|
189
|
+
concepts.each { |mc| collection.store(mc) }
|
|
190
|
+
end
|
|
191
|
+
collection
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
def save_to_path(collection, path)
|
|
195
|
+
if path.end_with?(".gcr")
|
|
196
|
+
tmpdir = build_temp_dataset(collection.managed_concepts)
|
|
197
|
+
begin
|
|
198
|
+
GC.start
|
|
199
|
+
tmp_gcr = "#{path}.tmp.#{Process.pid}"
|
|
200
|
+
GcrPackage.create_from_directory(
|
|
201
|
+
tmpdir,
|
|
202
|
+
output: tmp_gcr,
|
|
203
|
+
shortname: File.basename(path, ".gcr"),
|
|
204
|
+
version: "1.0.0",
|
|
205
|
+
)
|
|
206
|
+
FileUtils.rm_f(path)
|
|
207
|
+
FileUtils.mv(tmp_gcr, path)
|
|
208
|
+
ensure
|
|
209
|
+
FileUtils.rm_rf(tmpdir)
|
|
210
|
+
FileUtils.rm_f(tmp_gcr)
|
|
211
|
+
end
|
|
212
|
+
else
|
|
213
|
+
save_dataset(collection.managed_concepts, path)
|
|
214
|
+
end
|
|
215
|
+
end
|
|
216
|
+
|
|
217
|
+
def save_dataset(concepts, dir)
|
|
218
|
+
concepts_dir = File.join(dir, "concepts")
|
|
219
|
+
FileUtils.mkdir_p(concepts_dir)
|
|
220
|
+
collection = ManagedConceptCollection.new
|
|
221
|
+
concepts.each { |mc| collection.store(mc) }
|
|
222
|
+
collection.save_grouped_concepts_to_files(concepts_dir)
|
|
223
|
+
end
|
|
224
|
+
|
|
225
|
+
def create_gcr(concepts, output, shortname:, version:, **opts)
|
|
226
|
+
tmpdir = build_temp_dataset(concepts)
|
|
227
|
+
begin
|
|
228
|
+
GcrPackage.create_from_directory(
|
|
229
|
+
tmpdir,
|
|
230
|
+
output: output,
|
|
231
|
+
shortname: shortname,
|
|
232
|
+
version: version,
|
|
233
|
+
**opts,
|
|
234
|
+
)
|
|
235
|
+
ensure
|
|
236
|
+
FileUtils.rm_rf(tmpdir)
|
|
237
|
+
end
|
|
238
|
+
end
|
|
239
|
+
|
|
240
|
+
def build_temp_dataset(concepts)
|
|
241
|
+
tmpdir = Dir.mktmpdir("glossarist-sts-import")
|
|
242
|
+
concepts_dir = File.join(tmpdir, "concepts")
|
|
243
|
+
FileUtils.mkdir_p(concepts_dir)
|
|
244
|
+
|
|
245
|
+
collection = ManagedConceptCollection.new
|
|
246
|
+
concepts.each { |mc| collection.store(mc) }
|
|
247
|
+
collection.save_grouped_concepts_to_files(concepts_dir)
|
|
248
|
+
|
|
249
|
+
tmpdir
|
|
250
|
+
end
|
|
251
|
+
end
|
|
252
|
+
end
|
|
253
|
+
end
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Glossarist
  module Sts
    # Walks an ISO STS XML document (parsed via the `sts` gem) and pulls
    # every <term-sec> out of the body, turning each one into an
    # ExtractedTerm value object.
    class TermExtractor
      # @param xml_path [String] path to an ISO STS XML file
      def initialize(xml_path)
        xml = File.read(xml_path)
        @standard = ::Sts::IsoSts::Standard.from_xml(xml)
        @source_ref = extract_source_ref
      end

      # @return [Array<Sts::ExtractedTerm>] one entry per term-sec that
      #   carries a term-entry; term-secs without one are skipped.
      def extract
        collect_term_secs.filter_map do |sec|
          build_extracted_term(sec) if sec.term_entry
        end
      end

      private

      # Gathers every term-sec reachable from the document body, in
      # document order. Empty array when the body is missing.
      def collect_term_secs
        found = []
        walk_sections(@standard.body, found) if @standard.body
        found
      end

      # Recursive traversal: harvest this container's own term-secs,
      # then descend into its ordinary child sections.
      def walk_sections(container, found)
        collect_term_secs_from(container, found)
        walk_child_secs(container, found)
      end

      # Collects direct term-secs and recurses into nested ones
      # (ISO STS allows term-secs inside term-secs).
      def collect_term_secs_from(container, found)
        container.term_sec&.each do |sec|
          found << sec
          walk_sections(sec, found) if sec.term_sec&.any?
        end
      end

      def walk_child_secs(container, found)
        container_child_secs(container)&.each do |child|
          walk_sections(child, found)
        end
      end

      # Only Body and Sec expose plain child sections; any other
      # container type yields nil (no descent).
      def container_child_secs(container)
        case container
        when ::Sts::IsoSts::Body, ::Sts::IsoSts::Sec
          container.sec
        end
      end

      def build_extracted_term(term_sec)
        entry = term_sec.term_entry

        Sts::ExtractedTerm.new(
          id: entry.id,
          label: extract_label(term_sec),
          source_ref: @source_ref,
          lang_sets: entry.lang_set.filter_map { |set| build_lang_set(set) },
        )
      end

      # Stripped label text, or nil when the term-sec has no label
      # element. NOTE(review): an empty <label/> yields "" (not nil),
      # which downstream consumers may treat as a present label —
      # confirm that is intended.
      def extract_label(term_sec)
        label = term_sec.label
        return nil unless label

        label.content&.join.to_s.strip
      end

      def build_lang_set(lang_set)
        Sts::ExtractedLangSet.new(
          language_code: Sts.convert_language_code(lang_set.lang.to_s),
          definition_text: extract_definition_text(lang_set),
          note_texts: joined_texts(lang_set.note),
          example_texts: joined_texts(lang_set.example),
          source_texts: joined_texts(lang_set.source),
          domain: extract_subject_field(lang_set),
          designations: lang_set.tig.filter_map { |tig| build_designation(tig) },
        )
      end

      # Only the first <def> is used; additional definitions are
      # ignored. Missing definition maps to "".
      def extract_definition_text(lang_set)
        first_definition = lang_set.definition&.first
        return "" unless first_definition

        first_definition.value&.join.to_s.strip
      end

      # Shared text extraction for notes/examples/sources: join each
      # node's value parts, strip, and drop blank results.
      def joined_texts(nodes)
        nodes.filter_map do |node|
          text = node.value&.join.to_s.strip
          text unless text.empty?
        end
      end

      # First subject-field, stripped; nil when absent or blank.
      def extract_subject_field(lang_set)
        first_field = lang_set.subject_field&.first
        return nil unless first_field

        text = first_field.value&.join.to_s.strip
        text.empty? ? nil : text
      end

      def build_designation(tig)
        Sts::ExtractedDesignation.new(
          term: resolve_term_text(tig),
          type: map_term_type(tig),
          normative_status: map_normative_status(tig),
          part_of_speech: tig.pos&.value,
          abbreviation_type: map_abbreviation_type(tig),
        )
      end

      def resolve_term_text(tig)
        tig.term&.value&.join.to_s.strip
      end

      # Unknown or missing term types fall back to "expression".
      def map_term_type(tig)
        type_attr = tig.term_type&.value.to_s
        return "expression" if type_attr.empty?

        TERM_TYPE_MAP.fetch(type_attr, "expression")
      end

      # Distinguishes acronyms from other abbreviations; nil for
      # non-abbreviation term types.
      def map_abbreviation_type(tig)
        type_attr = tig.term_type&.value.to_s
        return nil unless TERM_TYPE_MAP[type_attr] == "abbreviation"

        type_attr == "acronym" ? "acronym" : "truncation"
      end

      def map_normative_status(tig)
        NORMATIVE_STATUS_MAP[tig.normative_authorization&.value.to_s]
      end

      # Picks the best std-ref from the front matter: a dated reference
      # beats an undated one, which beats whatever comes first. Returns
      # nil when the document carries no usable metadata.
      def extract_source_ref
        front = @standard.front
        return nil unless front

        meta = front.iso_meta || front.std_meta
        refs = meta&.std_ref
        return nil unless refs&.any?

        best = refs.find { |r| r.type == "dated" }
        best ||= refs.find { |r| r.type == "undated" }
        best ||= refs.first
        extract_ref_text(best)
      end

      # std-ref content may be a plain string value or mixed-content
      # parts; both normalize to a stripped string.
      def extract_ref_text(ref)
        text = ref.value.is_a?(String) ? ref.value : ref.content&.join
        text.to_s.strip
      end
    end
  end
end
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Glossarist
  module Sts
    # Converts an ExtractedTerm (the STS-side intermediate produced by
    # TermExtractor) into a Glossarist::ManagedConcept with one
    # localized concept per language set.
    class TermMapper
      # @param extracted_term [Sts::ExtractedTerm]
      # @return [Glossarist::ManagedConcept]
      def map(extracted_term)
        # The human-readable label is preferred as concept id; the raw
        # XML id is only a fallback.
        concept_id = extracted_term.label || extracted_term.id
        managed = Glossarist::ManagedConcept.new(data: { id: concept_id })

        extracted_term.lang_sets.each do |lang_set|
          localized = build_localized_concept(lang_set, extracted_term.source_ref)
          managed.add_localization(localized)
        end

        managed
      end

      private

      def build_localized_concept(lang_set, source_ref)
        data = {
          "language_code" => lang_set.language_code,
          "terms" => lang_set.designations.map { |d| build_designation(d) },
          "definition" => build_definitions(lang_set.definition_text),
          "notes" => build_detailed_definitions(lang_set.note_texts),
          "examples" => build_detailed_definitions(lang_set.example_texts),
          "sources" => build_sources(lang_set.source_texts, source_ref),
          "domain" => lang_set.domain,
          "entry_status" => "valid",
        }

        Glossarist::LocalizedConcept.of_yaml("data" => data)
      end

      # A blank or missing definition maps to an empty list.
      def build_definitions(text)
        text.nil? || text.empty? ? [] : [{ "content" => text }]
      end

      # Wraps each non-blank text in a detailed-definition hash.
      def build_detailed_definitions(texts)
        texts.reject(&:empty?).map { |text| { "content" => text } }
      end

      # Dispatches on the designation type; anything unrecognised is
      # treated as a plain expression.
      def build_designation(ext_desig)
        case ext_desig.type
        when "abbreviation" then build_abbreviation_designation(ext_desig)
        when "symbol"       then build_symbol_designation(ext_desig)
        else                     build_expression_designation(ext_desig)
        end
      end

      # Keys shared by every designation kind; nil values (e.g. a
      # missing normative status) are dropped.
      def base_designation(type, ext_desig)
        {
          "type" => type,
          "designation" => ext_desig.term,
          "normative_status" => ext_desig.normative_status,
        }.compact
      end

      def build_expression_designation(ext_desig)
        designation = base_designation("expression", ext_desig)
        pos = ext_desig.part_of_speech
        designation["grammar_info"] = [{ "part_of_speech" => pos }] if pos
        designation
      end

      def build_abbreviation_designation(ext_desig)
        designation = base_designation("abbreviation", ext_desig)
        if ext_desig.abbreviation_type
          designation["abbreviation_type"] = ext_desig.abbreviation_type
        end
        designation
      end

      def build_symbol_designation(ext_desig)
        base_designation("symbol", ext_desig)
      end

      # One "identical"/authoritative entry for the standard's own
      # reference (when known), followed by an authoritative entry per
      # non-blank <source> text.
      def build_sources(source_texts, source_ref)
        document_entries =
          if source_ref
            [{
              "status" => "identical",
              "type" => "authoritative",
              "origin" => { "text" => source_ref },
            }]
          else
            []
          end

        text_entries = source_texts.reject(&:empty?).map do |text|
          { "type" => "authoritative", "origin" => { "text" => text } }
        end

        document_entries + text_entries
      end
    end
  end
end
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "sts"

module Glossarist
  # Namespace for the ISO STS import pipeline: extraction of terms from
  # STS XML, mapping into Glossarist concepts, and the shared lookup
  # tables both sides use.
  module Sts
    autoload :ExtractedDesignation, "#{__dir__}/sts/extracted_designation"
    autoload :ExtractedLangSet, "#{__dir__}/sts/extracted_lang_set"
    autoload :ExtractedTerm, "#{__dir__}/sts/extracted_term"
    autoload :ImportResult, "#{__dir__}/sts/import_result"
    autoload :Importer, "#{__dir__}/sts/importer"
    autoload :TermExtractor, "#{__dir__}/sts/term_extractor"
    autoload :TermMapper, "#{__dir__}/sts/term_mapper"

    # ISO 639-1 (two-letter) to ISO 639-2/T (three-letter) language
    # code lookup table.
    ISO_639_1_TO_639_2 = {
      "aa" => "aar", "ab" => "abk", "af" => "afr", "ak" => "aka",
      "am" => "amh", "an" => "arg", "ar" => "ara", "as" => "asm",
      "av" => "ava", "ay" => "aym", "az" => "aze", "ba" => "bak",
      "be" => "bel", "bg" => "bul", "bh" => "bih", "bi" => "bis",
      "bm" => "bam", "bn" => "ben", "bo" => "bod", "br" => "bre",
      "bs" => "bos", "ca" => "cat", "ce" => "che", "ch" => "cha",
      "co" => "cos", "cr" => "cre", "cs" => "ces", "cu" => "chu",
      "cv" => "chv", "cy" => "cym", "da" => "dan", "de" => "deu",
      "dv" => "div", "dz" => "dzo", "ee" => "ewe", "el" => "ell",
      "en" => "eng", "eo" => "epo", "es" => "spa", "et" => "est",
      "eu" => "eus", "fa" => "fas", "ff" => "ful", "fi" => "fin",
      "fj" => "fij", "fo" => "fao", "fr" => "fra", "fy" => "fry",
      "ga" => "gle", "gd" => "gla", "gl" => "glg", "gn" => "grn",
      "gu" => "guj", "gv" => "glv", "ha" => "hau", "he" => "heb",
      "hi" => "hin", "ho" => "hmo", "hr" => "hrv", "ht" => "hat",
      "hu" => "hun", "hy" => "hye", "hz" => "her", "ia" => "ina",
      "id" => "ind", "ie" => "ile", "ig" => "ibo", "ii" => "iii",
      "ik" => "ipk", "io" => "ido", "is" => "isl", "it" => "ita",
      "iu" => "iku", "ja" => "jpn", "jv" => "jav", "ka" => "kat",
      "kg" => "kon", "ki" => "kik", "kj" => "kua", "kk" => "kaz",
      "kl" => "kal", "km" => "khm", "kn" => "kan", "ko" => "kor",
      "kr" => "kau", "ks" => "kas", "ku" => "kur", "kv" => "kom",
      "kw" => "cor", "ky" => "kir", "la" => "lat", "lb" => "ltz",
      "lg" => "lug", "li" => "lim", "ln" => "lin", "lo" => "lao",
      "lt" => "lit", "lu" => "lub", "lv" => "lav", "mg" => "mlg",
      "mh" => "mah", "mi" => "mri", "mk" => "mkd", "ml" => "mal",
      "mn" => "mon", "mr" => "mar", "ms" => "msa", "mt" => "mlt",
      "my" => "mya", "na" => "nau", "nb" => "nob", "nd" => "nde",
      "ne" => "nep", "ng" => "ndo", "nl" => "nld", "nn" => "nno",
      "no" => "nor", "nr" => "nbl", "nv" => "nav", "ny" => "nya",
      "oc" => "oci", "oj" => "oji", "om" => "orm", "or" => "ori",
      "os" => "oss", "pa" => "pan", "pi" => "pli", "pl" => "pol",
      "ps" => "pus", "pt" => "por", "qu" => "que", "rm" => "roh",
      "rn" => "run", "ro" => "ron", "ru" => "rus", "rw" => "kin",
      "sa" => "san", "sc" => "srd", "sd" => "snd", "se" => "sme",
      "sg" => "sag", "si" => "sin", "sk" => "slk", "sl" => "slv",
      "sm" => "smo", "sn" => "sna", "so" => "som", "sq" => "sqi",
      "sr" => "srp", "ss" => "ssw", "st" => "sot", "su" => "sun",
      "sv" => "swe", "sw" => "swa", "ta" => "tam", "te" => "tel",
      "tg" => "tgk", "th" => "tha", "ti" => "tir", "tk" => "tuk",
      "tl" => "tgl", "tn" => "tsn", "to" => "ton", "tr" => "tur",
      "ts" => "tso", "tt" => "tat", "tw" => "twi", "ty" => "tah",
      "ug" => "uig", "uk" => "ukr", "ur" => "urd", "uz" => "uzb",
      "ve" => "ven", "vi" => "vie", "vo" => "vol", "wa" => "wln",
      "wo" => "wol", "xh" => "xho", "yi" => "yid", "yo" => "yor",
      "za" => "zha", "zh" => "zho", "zu" => "zul"
    }.freeze

    # STS term-type attribute values mapped onto Glossarist designation
    # types; values not listed here are handled as "expression" by the
    # callers.
    TERM_TYPE_MAP = {
      "acronym" => "abbreviation",
      "abbreviation" => "abbreviation",
      "fullForm" => "expression",
      "symbol" => "symbol",
      "variant" => "expression",
      "equation" => "expression",
      "formula" => "expression",
    }.freeze

    # STS normative-authorization values mapped onto Glossarist
    # normative statuses; unlisted values yield nil.
    NORMATIVE_STATUS_MAP = {
      "preferredTerm" => "preferred",
      "admittedTerm" => "admitted",
      "deprecatedTerm" => "deprecated",
    }.freeze

    # Normalizes a language code to three letters where possible.
    # Three-letter (and nil) input is returned untouched; unknown
    # two-letter codes fall through unchanged.
    # NOTE(review): lookup is case-sensitive — an uppercase "EN" would
    # not be converted; confirm upstream codes are always lowercase.
    def self.convert_language_code(code)
      return code if code.nil? || code.length == 3

      ISO_639_1_TO_639_2.fetch(code, code)
    end
  end
end
|
data/lib/glossarist/version.rb
CHANGED
data/lib/glossarist.rb
CHANGED
|
@@ -7,14 +7,13 @@ require "psych"
|
|
|
7
7
|
require "thor"
|
|
8
8
|
require "lutaml/model"
|
|
9
9
|
|
|
10
|
-
require_relative "glossarist/glossary_definition"
|
|
11
|
-
|
|
12
10
|
module Glossarist
|
|
13
11
|
autoload :Asset, "glossarist/asset"
|
|
14
12
|
autoload :Citation, "glossarist/citation"
|
|
15
13
|
autoload :CLI, "glossarist/cli"
|
|
16
14
|
autoload :CollectionConfig, "glossarist/collection_config"
|
|
17
15
|
autoload :Collection, "glossarist/collection"
|
|
16
|
+
autoload :Collections, "glossarist/collections"
|
|
18
17
|
autoload :Concept, "glossarist/concept"
|
|
19
18
|
autoload :ConceptData, "glossarist/concept_data"
|
|
20
19
|
autoload :ConceptReference, "glossarist/concept_reference"
|
|
@@ -35,10 +34,10 @@ module Glossarist
|
|
|
35
34
|
autoload :DetailedDefinition, "glossarist/detailed_definition"
|
|
36
35
|
autoload :Designation, "glossarist/designation"
|
|
37
36
|
autoload :Error, "glossarist/error"
|
|
38
|
-
autoload :GcrPackage,
|
|
39
|
-
autoload :GcrMetadata,
|
|
40
|
-
autoload :GcrStatistics,
|
|
41
|
-
autoload :GcrValidator,
|
|
37
|
+
autoload :GcrPackage, "glossarist/gcr_package"
|
|
38
|
+
autoload :GcrMetadata, "glossarist/gcr_metadata"
|
|
39
|
+
autoload :GcrStatistics, "glossarist/gcr_statistics"
|
|
40
|
+
autoload :GcrValidator, "glossarist/gcr_validator"
|
|
42
41
|
autoload :InvalidTypeError, "glossarist/error/invalid_type_error"
|
|
43
42
|
autoload :InvalidLanguageCodeError,
|
|
44
43
|
"glossarist/error/invalid_language_code_error"
|
|
@@ -52,16 +51,20 @@ module Glossarist
|
|
|
52
51
|
autoload :ManagedConceptData, "glossarist/managed_concept_data"
|
|
53
52
|
autoload :NonVerbRep, "glossarist/non_verb_rep"
|
|
54
53
|
autoload :RelatedConcept, "glossarist/related_concept"
|
|
54
|
+
autoload :Rdf, "glossarist/rdf"
|
|
55
|
+
autoload :Sts, "glossarist/sts"
|
|
56
|
+
autoload :Transforms, "glossarist/transforms"
|
|
55
57
|
autoload :SchemaMigration, "glossarist/schema_migration"
|
|
56
58
|
autoload :UrnResolver, "glossarist/urn_resolver"
|
|
57
59
|
autoload :Utilities, "glossarist/utilities"
|
|
58
|
-
autoload :RegisterData,
|
|
60
|
+
autoload :RegisterData, "glossarist/register_data"
|
|
59
61
|
autoload :ValidationResult, "glossarist/validation_result"
|
|
60
62
|
autoload :V1, "glossarist/v1"
|
|
61
63
|
end
|
|
62
64
|
|
|
63
65
|
require_relative "glossarist/version"
|
|
64
66
|
require_relative "glossarist/collections"
|
|
67
|
+
require_relative "glossarist/glossary_definition"
|
|
65
68
|
|
|
66
69
|
module Glossarist
|
|
67
70
|
LANG_CODES = %w[eng ara deu fra spa ita jpn kor pol por srp swe zho rus fin
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: glossarist
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.6.
|
|
4
|
+
version: 2.6.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-05-
|
|
11
|
+
date: 2026-05-12 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: lutaml-model
|
|
@@ -122,6 +122,7 @@ files:
|
|
|
122
122
|
- lib/glossarist/citation.rb
|
|
123
123
|
- lib/glossarist/cli.rb
|
|
124
124
|
- lib/glossarist/cli/export_command.rb
|
|
125
|
+
- lib/glossarist/cli/import_command.rb
|
|
125
126
|
- lib/glossarist/cli/package_command.rb
|
|
126
127
|
- lib/glossarist/cli/upgrade_command.rb
|
|
127
128
|
- lib/glossarist/cli/validate_command.rb
|
|
@@ -192,6 +193,14 @@ files:
|
|
|
192
193
|
- lib/glossarist/resolution_adapter/remote.rb
|
|
193
194
|
- lib/glossarist/resolution_adapter/route.rb
|
|
194
195
|
- lib/glossarist/schema_migration.rb
|
|
196
|
+
- lib/glossarist/sts.rb
|
|
197
|
+
- lib/glossarist/sts/extracted_designation.rb
|
|
198
|
+
- lib/glossarist/sts/extracted_lang_set.rb
|
|
199
|
+
- lib/glossarist/sts/extracted_term.rb
|
|
200
|
+
- lib/glossarist/sts/import_result.rb
|
|
201
|
+
- lib/glossarist/sts/importer.rb
|
|
202
|
+
- lib/glossarist/sts/term_extractor.rb
|
|
203
|
+
- lib/glossarist/sts/term_mapper.rb
|
|
195
204
|
- lib/glossarist/transforms.rb
|
|
196
205
|
- lib/glossarist/transforms/concept_to_skos_transform.rb
|
|
197
206
|
- lib/glossarist/transforms/concept_to_tbx_transform.rb
|