iev 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,143 @@
1
+ # frozen_string_literal: true
2
+
3
+ # (c) Copyright 2020 Ribose Inc.
4
+ #
5
+
6
+ module IEV
7
# Parses information from the spreadsheet's TERMATTRIBUTE column and alike.
#
# The attribute string is parsed eagerly in the constructor. Each recognised
# piece is cut out of a working copy of the string and stored in the
# corresponding reader; readers of attributes that were not found return
# +nil+. If word characters remain after all extractors ran, a debug message
# is emitted.
#
# @example
#   parser = TermAttrsParser.new(cell_data_string)
#   parser.gender # returns grammatical gender
#   parser.plurality # returns grammatical plurality
#   parser.part_of_speech # returns part of speech
class TermAttrsParser
  include CLI::UI
  using DataConversions

  attr_reader :raw_str, :src_str

  attr_reader :gender, :geographical_area, :part_of_speech, :plurality,
              :prefix, :usage_info

  # Maps part-of-speech markers found in the source to normalised values.
  PARTS_OF_SPEECH = {
    "adj" => "adj",
    "noun" => "noun",
    "verb" => "verb",
    "名詞" => "noun",
    "動詞" => "verb",
    "形容詞" => "adj",
    "형용사" => "adj",
    "Adjektiv" => "adj",
  }.freeze

  # Keywords whose presence marks the designation as a prefix.
  PREFIX_KEYWORDS = %w[
    Präfix prefix préfixe 接尾語 접두사 przedrostek prefixo 词头
  ].freeze

  # All patterns below are built once at load time. The two interpolated
  # ones would otherwise be recompiled for every parser instance.
  GENDER_RX = /\b[mfn]\b/.freeze

  PLURAL_RX = /\bpl\b/.freeze

  GEOGRAPHICAL_AREA_RX = /\b[A-Z]{2}$/.freeze

  PART_OF_SPEECH_RX = %r{
    \b
    #{Regexp.union(PARTS_OF_SPEECH.keys)}
    \b
  }x.freeze

  USAGE_INFO_RX = %r{
    # regular ASCII less and greater than signs
    < (?<inner>.*?) >
    |
    # full-width less and greater than signs, U+FF1C and U+FF1E,
    # which are used instead of ASCII signs in some CJK terms
    \uFF1C (?<inner>.*?) \uFF1E
  }x.freeze

  PREFIX_RX = %r{
    \b
    #{Regexp.union(PREFIX_KEYWORDS)}
    \b
  }x.freeze

  private_constant :GENDER_RX, :PLURAL_RX, :GEOGRAPHICAL_AREA_RX,
                   :PART_OF_SPEECH_RX, :USAGE_INFO_RX, :PREFIX_RX

  # @param attr_str [String] raw cell content; a frozen copy is kept in
  #   +raw_str+ and its HTML-decoded form in +src_str+
  def initialize(attr_str)
    @raw_str = attr_str.dup.freeze
    @src_str = decode_attrs_string(raw_str).freeze
    parse
  end

  # @return [String] short representation showing the decoded source string
  def inspect
    "<ATTRIBUTES: #{src_str}>".freeze
  end

  private

  # Runs every extractor against a mutable copy of +src_str+. Order matters:
  # plurality relies on gender having been extracted first.
  def parse
    remainder = src_str.dup

    extract_gender(remainder)
    extract_plurality(remainder)
    extract_geographical_area(remainder)
    extract_part_of_speech(remainder)
    extract_usage_info(remainder)
    extract_prefix(remainder)

    # Anything word-like still left means some attribute was not recognised.
    return unless /\p{Word}/.match?(remainder)

    debug(
      :term_attributes,
      "Term attributes could not be parsed completely: '#{src_str}'",
    )
  end

  # Stores the first stand-alone "m", "f" or "n" as gender.
  def extract_gender(str)
    @gender = remove_from_string(str, GENDER_RX)
  end

  # Must happen after #extract_gender
  def extract_plurality(str)
    if remove_from_string(str, PLURAL_RX)
      @plurality = "plural"
    elsif gender
      # TODO Really needed?
      @plurality = "singular"
    end
  end

  # TODO this is likely buggy
  def extract_geographical_area(str)
    @geographical_area = remove_from_string(str, GEOGRAPHICAL_AREA_RX)
  end

  # Stores the normalised part of speech, or +nil+ when none is present.
  def extract_part_of_speech(str)
    removed = remove_from_string(str, PART_OF_SPEECH_RX)
    @part_of_speech = PARTS_OF_SPEECH[removed] || removed
  end

  # Stores the stripped text found between angle brackets, if any.
  def extract_usage_info(str)
    remove_from_string(str, USAGE_INFO_RX) do |md|
      @usage_info = md[:inner].strip
    end
  end

  # Sets +prefix+ to +true+ when one of PREFIX_KEYWORDS is present.
  def extract_prefix(str)
    @prefix = true if remove_from_string(str, PREFIX_RX)
  end

  # Falls back to an empty string when decoding yields +nil+.
  def decode_attrs_string(str)
    str.decode_html || ""
  end

  # Removes the first match of +regexp+ from +string+ in place.
  #
  # @yieldparam match [MatchData] yielded only when something matched
  # @return the block's value when a block is given and a match was found;
  #   otherwise the removed substring, or +nil+ when nothing matched
  def remove_from_string(string, regexp)
    string.sub!(regexp, "")
    match = Regexp.last_match

    return yield(match) if match && block_given?

    match && match[0]
  end
end
143
+ end
@@ -0,0 +1,313 @@
1
+ # frozen_string_literal: true
2
+
3
+ # (c) Copyright 2020 Ribose Inc.
4
+ #
5
+
6
+ require "pp"
7
+
8
+ module IEV
9
# Builds a +Glossarist::LocalizedConcept+ from one record of term data.
#
# +data+ is accessed with +fetch+ using symbol keys such as +:IEVREF+,
# +:LANGUAGE+ or +:DEFINITION+; every value read goes through +sanitize+.
class TermBuilder
  include CLI::UI
  include Utilities
  using DataConversions

  # Localised status values that are translated to "deprecated"; any other
  # status is passed through unchanged.
  NORMATIVE_STATUS_TRANSLATIONS = {
    "obsoleto" => "deprecated",
    "напуштен" => "deprecated",
  }.freeze

  # Matches the marker that introduces an example or a note inside the
  # unified definition text.
  DEFINITION_SLICER_RX = %r{
    \s*
    (?:<p>\s*)?
    (
      (?<example>
        # English example
        \bEXAMPLE\b |
        ^\bExamples\s+are\b: |
        ^\bExamples\b: |
        ^\bExample\b: |
        # French examples
        \bEXEMPLE\b |
        ^\bExemples\b:
      )
      |
      (?<note>
        Note\s*\d+\sto\sentry: |
        Note&nbsp;\d+\sto\sentry: |
        Note\s*\d+\sto\sthe\sentry: |
        Note\sto\sentry\s*\d+: |
        Note\s*\d+?\sà\sl['’]article: |
        <NOTE\/?>?\s*\d?\s+.*?– |
        NOTE(?:\s+-)? |
        Note\s+\d+\s– |
        Note&nbsp;\d+\s
      )
    )
    \s*
  }x.freeze

  # @param data [#fetch] term record keyed by symbols
  def initialize(data)
    @data = data
  end

  # @return [Glossarist::LocalizedConcept]
  def build
    build_term_object
  end

  # Convenience shortcut for +new(data).build+.
  def self.build_from(data)
    new(data).build
  end

  attr_reader :data

  # @param key [#to_sym] column name
  # @return the sanitized value, or +nil+ when the key is absent or nil
  def find_value_for(key)
    data.fetch(key.to_sym, nil)&.sanitize
  end

  # Completes a "YYYY" or "YYYY-MM" date with "01" for the missing parts.
  #
  # @param incomplete_date [String, nil]
  # @return [String, nil] +DateTime#to_s+ of the completed date; +nil+ or an
  #   empty string is returned unchanged
  def flesh_date(incomplete_date)
    return incomplete_date if incomplete_date.nil? || incomplete_date.empty?

    year, month, day = incomplete_date.split("-")

    month ||= "01"
    day ||= "01"

    DateTime.parse("#{year}-#{month}-#{day}").to_s
  end

  # Reports progress, splits the definition and instantiates the concept.
  def build_term_object
    set_ui_tag "#{term_id} (#{term_language})"
    progress "Processing term #{term_id} (#{term_language})..."

    split_definition

    Glossarist::LocalizedConcept.new(term_hash)
  end

  # Assembles the attribute hash handed to +Glossarist::LocalizedConcept+.
  # Entries whose value is +nil+ are dropped. Requires #split_definition to
  # have run, because notes, examples and definition read its results.
  def term_hash
    dates = nil

    if publication_date
      dates = [
        { type: :accepted, date: publication_date },
        { type: :amended, date: publication_date },
      ]
    end

    {
      id: term_id,
      entry_status: extract_entry_status,
      classification: extract_classification,
      dates: dates,
      review_date: publication_date,
      review_decision_date: publication_date,
      review_decision_event: "published",
      terms: extract_terms,
      notes: extract_notes,
      examples: extract_examples,
      definition: [{ "content" => extract_definition_value }],
      sources: extract_authoritative_source,
      language_code: term_language,
      related: extract_superseded_concepts,
    }.compact
  end

  def term_id
    @term_id ||= find_value_for("IEVREF")
  end

  # First three characters of the term id.
  def term_domain
    @term_domain ||= term_id.slice(0, 3)
  end

  def term_language
    @term_language ||= find_value_for("LANGUAGE").to_three_char_code
  end

  # Splits unified definition (from the spreadsheet) into separate
  # definition, examples, and notes strings (for YAMLs).
  #
  # Sets +@definition+, +@examples+ and +@notes+ variables.
  def split_definition
    @examples = []
    @notes = []
    definition_parts = [] # here array for consistent interface

    target = definition_parts
    remaining = find_value_for("DEFINITION")

    # Non-mutating +sub+ is used throughout so that the string returned by
    # #find_value_for is never modified in place.
    while (md = remaining&.match(DEFINITION_SLICER_RX))
      target.push(unmask_example(md.pre_match))
      target = md[:example] ? @examples : @notes
      # 112-03-17
      # supplements the name of a quantity, especially for a component in a
      # system, to indicate the quotient of that quantity by the total
      # volume
      # <NOTE – Examples: amount-of-substance volume concentration of
      # component B (or concentration of B, in particular, ion
      # concentration), molecular concentration of B, electron concentration
      # (or electron density).
      #
      # In the above case the `Example` is part of the note but the regex
      # above will capture it as an example and will add an empty `Note`
      # and put the rest in an `Example`. So in this case we replace
      # the `Example` with `[:Example]` and revert it in the next iteration
      # so it will not be caught by the regex.
      remaining = md.post_match
      remaining = remaining.sub(/^Ex(a|e)mple/, "[:Ex\\1mple]") if md[:note]
    end

    target.push(remaining && unmask_example(remaining))
    @definition = definition_parts.first
    @definition = nil if @definition&.empty?
  end

  # @return [Array<Hash>] primary, synonymous and symbol designations
  def extract_terms
    [
      extract_primary_designation,
      *extract_synonymous_designations,
      extract_international_symbol_designation,
    ].compact
  end

  def extract_primary_designation
    raw_term = find_value_for("TERM")
    raw_term = "NA" if raw_term == "....."

    build_expression_designation(
      raw_term,
      attribute_data: find_value_for("TERMATTRIBUTE"),
      status: "preferred",
    )
  end

  # Reads SYNONYM1..SYNONYM3 together with their ATTRIBUTE and STATUS
  # columns.
  def extract_synonymous_designations
    designations = (1..3).flat_map do |num|
      raw_terms = find_value_for("SYNONYM#{num}") || ""

      # Some synonyms have more than one entry
      raw_terms.split(/<[pbr]+>/).map do |raw_term|
        build_expression_designation(
          raw_term,
          attribute_data: find_value_for("SYNONYM#{num}ATTRIBUTE"),
          status: find_value_for("SYNONYM#{num}STATUS")&.downcase,
        )
      end
    end

    designations.compact
  end

  def extract_international_symbol_designation
    raw_term = find_value_for("SYMBOLE")
    raw_term && build_symbol_designation(raw_term)
  end

  # @return [String, nil] converted definition text; +nil+ when the
  #   definition is missing or empty
  def extract_definition_value
    convert_markup(@definition) if @definition
  end

  def extract_examples
    @examples.map { |str| convert_markup(str) }
  end

  def extract_notes
    @notes.map { |str| convert_markup(str) }
  end

  # @return [String, nil] "valid" for status "standard" (case-insensitive);
  #   +nil+ otherwise, including when STATUS is absent
  def extract_entry_status
    "valid" if find_value_for("STATUS")&.downcase == "standard"
  end

  # Normalises the SYNONYM1STATUS value; unknown values pass through.
  def extract_classification
    classification_val = find_value_for("SYNONYM1STATUS")

    case classification_val
    when "", "认可的", "допустимый", "admitido"
      "admitted"
    when "首选的", "suositettava", "suositeltava", "рекомендуемый", "preferente"
      "preferred"
    else
      classification_val
    end
  end

  # @return [Array<Hash>, nil] parsed sources, each tagged with type
  #   "authoritative"; +nil+ when SOURCE is absent
  def extract_authoritative_source
    source_val = find_value_for("SOURCE")
    return nil if source_val.nil?

    SourceParser.new(source_val, term_domain)
      .parsed_sources
      .compact
      .map { |source| source.merge({ "type" => "authoritative" }) }
  end

  def extract_superseded_concepts
    replaces_val = find_value_for("REPLACES")
    return nil if replaces_val.nil?

    SupersessionParser.new(replaces_val).supersessions
  end

  private

  # Fleshed-out PUBLICATIONDATE. Deliberately not memoized: every call
  # yields its own String so the fields of #term_hash do not share one
  # object.
  def publication_date
    flesh_date(find_value_for("PUBLICATIONDATE"))
  end

  # Reverts the `[:Example]` masking applied in #split_definition.
  def unmask_example(str)
    str.sub(/^\[:Ex(a|e)mple\]/, "Ex\\1mple")
  end

  # Shared pipeline for definition, examples and notes: anchor/markup
  # conversion, newline normalisation, MathML conversion, then strip.
  def convert_markup(str)
    IEV::Converter.mathml_to_asciimath(
      replace_newlines(parse_anchor_tag(str, term_domain)),
    ).strip
  end

  def build_expression_designation(raw_term, attribute_data:, status:)
    term = IEV::Converter.mathml_to_asciimath(
      parse_anchor_tag(raw_term, term_domain),
    )
    term_attributes = TermAttrsParser.new(attribute_data.to_s)

    {
      "type" => "expression",
      "prefix" => term_attributes.prefix,
      "normative_status" => NORMATIVE_STATUS_TRANSLATIONS[status] || status,
      "usage_info" => term_attributes.usage_info,
      "designation" => term,
      "part_of_speech" => term_attributes.part_of_speech,
      "geographical_area" => term_attributes.geographical_area,
      "gender" => term_attributes.gender,
      "plurality" => term_attributes.plurality,
    }.compact
  end

  def build_symbol_designation(raw_term)
    term = IEV::Converter.mathml_to_asciimath(
      parse_anchor_tag(raw_term, term_domain),
    )

    {
      "type" => "symbol",
      "designation" => term,
      "international" => true,
    }.compact
  end
end
313
+ end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
+ module IEV
4
# Markup conversion helpers meant to be mixed into other classes.
module Utilities
  SIMG_PATH_REGEX = "<simg .*\\/\\$file\\/([\\d\\-\\w\.]+)>"
  FIGURE_ONE_REGEX =
    "<p><b>\\s*Figure\\s+(\\d)\\s+[–-]\\s+(.+)\\s*<\\/b>(<\\/p>)?"
  FIGURE_TWO_REGEX = "#{FIGURE_ONE_REGEX}\\s*#{FIGURE_ONE_REGEX}"
  IMAGE_PATH_PREFIX = "image::/assets/images/parts"

  # Compiled once at load time instead of on every #parse_anchor_tag call.
  SIMG_TWO_FIGURES_PATTERN =
    Regexp.new([SIMG_PATH_REGEX, "\\s*", FIGURE_TWO_REGEX].join)
  SIMG_ONE_FIGURE_PATTERN =
    Regexp.new([SIMG_PATH_REGEX, "\\s*", FIGURE_ONE_REGEX].join)

  # Rewrites anchors, images, line breaks and bold tags found in +text+.
  #
  # @param text [String, nil] source markup
  # @param term_domain [String] path segment inserted into image paths
  # @return [String, nil] converted text; +nil+ when +text+ is +nil+
  def parse_anchor_tag(text, term_domain)
    return unless text

    # Convert IEV term references
    # Convert href links
    # Need to take care of this pattern:
    # `inverse de la <a href="IEV103-06-01">période<a>`
    text.gsub(
      /<a href="?(IEV)\s*(\d\d\d-\d\d-\d\d\d?)"?>(.*?)<\/?a>/,
      '{{\3, \1:\2}}',
    ).gsub(
      # References without the IEV prefix. This pattern has only two
      # groups: \1 is the reference number and \2 is the link text.
      /<a href="?\s*(\d\d\d-\d\d-\d\d\d?)"?>(.*?)<\/?a>/,
      '{{\2, IEV:\1}}',
    ).gsub(
      # To handle <a> tags without ending tag like
      # `Voir <a href=IEV103-05-21>IEV 103-05-21`
      # for concept '702-03-11' in `fr`
      /<a href="?(IEV)?\s*(\d\d\d-\d\d-\d\d\d?)"?>(.*?)$/,
      '{{\3, IEV:\2}}',
    ).gsub(
      # Any remaining link becomes `target[text]`.
      /<a href="?([^<>]*?)"?>(.*?)<\/a>/,
      '\1[\2]',
    ).gsub(
      # \6 is the caption of the second figure.
      SIMG_TWO_FIGURES_PATTERN,
      "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[Figure \\2 - \\3; \\6]",
    ).gsub(
      SIMG_ONE_FIGURE_PATTERN,
      "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[Figure \\2 - \\3]",
    ).gsub(
      /<img\s+([^<>]+?)\s*>/,
      "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[]",
    ).gsub(
      /<br>/,
      "\n",
    ).gsub(
      /<b>(.*?)<\/b>/,
      "*\\1*",
    )
  end

  # Normalises paragraph breaks to exactly one blank line and strips the
  # result.
  #
  # @param input [String]
  # @return [String]
  def replace_newlines(input)
    # The single-quoted '\n' is the two-character sequence backslash + "n",
    # not a real newline.
    input.gsub('\n', "\n\n")
      .gsub(/<[pbr]+>/, "\n\n")
      .gsub(/\s*\n[\n\s]+/, "\n\n")
      .strip
  end
end
58
+ end
data/lib/iev/version.rb CHANGED
@@ -1,3 +1,3 @@
1
- module Iev
2
- VERSION = "0.3.1".freeze
1
+ module IEV
2
+ VERSION = "0.3.3".freeze
3
3
  end
data/lib/iev.rb CHANGED
@@ -3,7 +3,27 @@ require "iev/db"
3
3
  require "open-uri"
4
4
  require "nokogiri"
5
5
 
6
- module Iev
6
+ require "benchmark"
7
+ require "creek"
8
+ require "unitsml"
9
+ require "plurimath"
10
+ require "glossarist"
11
+ require "relaton"
12
+ require "relaton_bib"
13
+ require "sequel"
14
+ require "thor"
15
+ require "yaml"
16
+ require "zeitwerk"
17
+
18
+ loader = Zeitwerk::Loader.for_gem
19
+ loader.inflector.inflect(
20
+ "cli" => "CLI",
21
+ "iev" => "IEV",
22
+ "ui" => "UI",
23
+ )
24
+ loader.setup
25
+
26
+ module IEV
7
27
  #
8
28
  # Scrape Electropedia for term.
9
29
  #
@@ -23,6 +43,8 @@ module Iev
23
43
  a = doc&.at(xpath)&.children&.to_xml
24
44
  a&.sub(%r{<br/>.*$}, "")
25
45
  &.sub(%r{, &lt;.*$}, "")
26
- &.gsub(%r{<[^>]*>}, "")&.strip
46
+ &.gsub(%r{<[^<>]*>}, "")&.strip
27
47
  end
28
48
  end
49
+
50
+ require "iev/cli"