iev 0.3.1 → 0.3.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,143 @@
1
+ # frozen_string_literal: true
2
+
3
+ # (c) Copyright 2020 Ribose Inc.
4
+ #
5
+
6
module IEV
  # Parses information from the spreadsheet's TERMATTRIBUTE column and alike.
  #
  # The attribute string is HTML-decoded, then each recognised attribute is
  # cut out of a working copy of the string one after another. If any word
  # characters are left over once every extractor has run, a debug message
  # is emitted so that unparsed attributes can be investigated.
  #
  # @example
  #   parser = TermAttrsParser.new(cell_data_string)
  #   parser.gender # returns grammatical gender
  #   parser.plurality # returns grammatical plurality
  #   parser.part_of_speech # returns part of speech
  class TermAttrsParser
    include CLI::UI
    using DataConversions

    # Recognised part-of-speech markers (in several languages) mapped to
    # their normalized English abbreviation.
    PARTS_OF_SPEECH = {
      "adj" => "adj",
      "noun" => "noun",
      "verb" => "verb",
      "名詞" => "noun",
      "動詞" => "verb",
      "形容詞" => "adj",
      "형용사" => "adj",
      "Adjektiv" => "adj",
    }.freeze

    # Keywords which mark the term as a prefix.
    PREFIX_KEYWORDS = %w[
      Präfix prefix préfixe 接尾語 접두사 przedrostek prefixo 词头
    ].freeze

    # The patterns below are compiled once at load time instead of on every
    # call of the respective extractor.

    # Single-letter grammatical gender marker.
    GENDER_RX = /\b[mfn]\b/.freeze

    # Plural marker.
    PLURAL_RX = /\bpl\b/.freeze

    # Two upper-case letters at the end of a line.
    GEOGRAPHICAL_AREA_RX = /\b[A-Z]{2}$/.freeze

    # Any key of PARTS_OF_SPEECH delimited by word boundaries.
    PART_OF_SPEECH_RX = %r{
      \b
      #{Regexp.union(PARTS_OF_SPEECH.keys)}
      \b
    }x.freeze

    # Usage information enclosed in angle brackets; the text between the
    # brackets is available as the +inner+ capture.
    USAGE_INFO_RX = %r{
      # regular ASCII less and greater than signs
      < (?<inner>.*?) >
      |
      # full-width less and greater than signs (U+FF1C and U+FF1E)
      # which are used instead of ASCII signs in some CJK terms
      \uFF1C (?<inner>.*?) \uFF1E
    }x.freeze

    # Any of PREFIX_KEYWORDS delimited by word boundaries.
    PREFIX_RX = %r{
      \b
      #{Regexp.union(PREFIX_KEYWORDS)}
      \b
    }x.freeze

    # raw_str is a frozen copy of the constructor argument, src_str is its
    # frozen, HTML-decoded form which is actually parsed.
    attr_reader :raw_str, :src_str

    # Parsed attributes. Each one is nil unless the respective marker was
    # found in the attribute string.
    attr_reader :gender, :geographical_area, :part_of_speech, :plurality,
                :prefix, :usage_info

    # @param attr_str [String] raw attribute cell content
    def initialize(attr_str)
      @raw_str = attr_str.dup.freeze
      @src_str = decode_attrs_string(raw_str).freeze
      parse
    end

    def inspect
      "<ATTRIBUTES: #{src_str}>".freeze
    end

    private

    # Runs all extractors on a mutable copy of the source string. The order
    # matters: every extractor removes what it has recognised, and
    # plurality extraction consults the already extracted gender.
    def parse
      curr_str = src_str.dup

      extract_gender(curr_str)
      extract_plurality(curr_str)
      extract_geographical_area(curr_str)
      extract_part_of_speech(curr_str)
      extract_usage_info(curr_str)
      extract_prefix(curr_str)

      # Anything word-like left at this point was not understood.
      return unless curr_str.match?(/\p{Word}/)

      debug(
        :term_attributes,
        "Term attributes could not be parsed completely: '#{src_str}'",
      )
    end

    def extract_gender(str)
      @gender = remove_from_string(str, GENDER_RX)
    end

    # Must happen after #extract_gender
    def extract_plurality(str)
      if remove_from_string(str, PLURAL_RX)
        @plurality = "plural"
      elsif !gender.nil?
        # TODO Really needed?
        @plurality = "singular"
      end
    end

    # TODO this is likely buggy
    def extract_geographical_area(str)
      @geographical_area = remove_from_string(str, GEOGRAPHICAL_AREA_RX)
    end

    # Stores the normalized part of speech, or nil when none was found.
    def extract_part_of_speech(str)
      removed = remove_from_string(str, PART_OF_SPEECH_RX)
      @part_of_speech = PARTS_OF_SPEECH[removed] || removed
    end

    # Stores the stripped text found between angle brackets, if any.
    def extract_usage_info(str)
      remove_from_string(str, USAGE_INFO_RX) do |md|
        @usage_info = md[:inner].strip
      end
    end

    def extract_prefix(str)
      @prefix = true if remove_from_string(str, PREFIX_RX)
    end

    def decode_attrs_string(str)
      str.decode_html || ""
    end

    # Removes the first occurrence of +regexp+ from +string+ in place.
    #
    # @param string [String] mutable string to remove the match from
    # @param regexp [Regexp] pattern to remove
    # @yieldparam match [MatchData] match data of the removed fragment;
    #   the block is called only when something was removed
    # @return the block's result when a block is given and the pattern
    #   matched; otherwise the removed substring, or nil when nothing
    #   matched
    def remove_from_string(string, regexp)
      string.sub!(regexp, "")
      match = Regexp.last_match # set by the sub! call above

      if match && block_given?
        yield match
      else
        match && match[0] # removed substring or nil
      end
    end
  end
end
@@ -0,0 +1,313 @@
1
+ # frozen_string_literal: true
2
+
3
+ # (c) Copyright 2020 Ribose Inc.
4
+ #
5
+
6
+ require "pp"
7
+
8
module IEV
  # Builds a Glossarist::LocalizedConcept out of a single row of term data.
  #
  # The row is read through #find_value_for, which fetches values by
  # symbolized column name (IEVREF, LANGUAGE, TERM, DEFINITION, ...).
  #
  # @example
  #   concept = TermBuilder.build_from(row_data)
  class TermBuilder
    include CLI::UI
    include Utilities
    using DataConversions

    # Localized designation statuses mapped to their normalized value.
    # Statuses missing from this table are passed through unchanged.
    NORMATIVE_STATUSES = {
      "obsoleto" => "deprecated",
      "напуштен" => "deprecated",
    }.freeze

    # Splits the unified definition text at example and note markers.
    # The marker is captured either as +example+ or as +note+.
    DEFINITION_SLICER_RX = %r{
      \s*
      (?:<p>\s*)?
      (
        (?<example>
          # English example
          \bEXAMPLE\b |
          ^\bExamples\s+are\b: |
          ^\bExamples\b: |
          ^\bExample\b: |
          # French examples
          \bEXEMPLE\b |
          ^\bExemples\b:
        )
        |
        (?<note>
          Note\s*\d+\sto\sentry: |
          Note&nbsp;\d+\sto\sentry: |
          Note\s*\d+\sto\sthe\sentry: |
          Note\sto\sentry\s*\d+: |
          Note\s*\d+?\sà\sl['’]article: |
          <NOTE\/?>?\s*\d?\s+.*?– |
          NOTE(?:\s+-)? |
          Note\s+\d+\s– |
          Note&nbsp;\d+\s
        )
      )
      \s*
    }x.freeze

    attr_reader :data

    # Convenience shortcut for +new(data).build+.
    def self.build_from(data)
      new(data).build
    end

    # @param data row data; must respond to +fetch(symbol, default)+
    def initialize(data)
      @data = data
    end

    # @return the object returned by Glossarist::LocalizedConcept.new
    def build
      build_term_object
    end

    # Looks up a column value and sanitizes it.
    #
    # @param key [String, Symbol] column name
    # @return sanitized value, or nil when the column is missing or nil
    def find_value_for(key)
      data.fetch(key.to_sym, nil)&.sanitize
    end

    # Completes a partial date ("YYYY" or "YYYY-MM") with "01" for the
    # missing month and day and returns DateTime's string representation.
    # nil and empty strings are returned unchanged.
    def flesh_date(incomplete_date)
      return incomplete_date if incomplete_date.nil? || incomplete_date.empty?

      year, month, day = incomplete_date.split("-")

      month ||= "01"
      day ||= "01"

      DateTime.parse("#{year}-#{month}-#{day}").to_s
    end

    def build_term_object
      set_ui_tag "#{term_id} (#{term_language})"
      progress "Processing term #{term_id} (#{term_language})..."

      split_definition

      Glossarist::LocalizedConcept.new(term_hash)
    end

    # Assembles the attributes passed to Glossarist::LocalizedConcept.new.
    # Entries whose value is nil are dropped. #split_definition must have
    # been called beforehand, as notes, examples and the definition are
    # read from the instance variables it sets.
    def term_hash
      dates = nil

      if publication_date
        dates = [
          { type: :accepted, date: publication_date },
          { type: :amended, date: publication_date },
        ]
      end

      {
        id: term_id,
        entry_status: extract_entry_status,
        classification: extract_classification,
        dates: dates,
        review_date: publication_date,
        review_decision_date: publication_date,
        review_decision_event: "published",
        terms: extract_terms,
        notes: extract_notes,
        examples: extract_examples,
        definition: [{ "content" => extract_definition_value }],
        sources: extract_authoritative_source,
        language_code: term_language,
        related: extract_superseded_concepts,
      }.compact
    end

    def term_id
      @term_id ||= find_value_for("IEVREF")
    end

    # First three characters of the term id.
    def term_domain
      @term_domain ||= term_id.slice(0, 3)
    end

    def term_language
      @term_language ||= find_value_for("LANGUAGE").to_three_char_code
    end

    # Splits unified definition (from the spreadsheet) into separate
    # definition, examples, and notes strings (for YAMLs).
    #
    # Sets +@definition+, +@examples+ and +@notes+ variables.
    def split_definition
      @examples = []
      @notes = []
      definition_arr = [] # here array for consistent interface

      # Text preceding the first marker belongs to the definition; text
      # following a marker belongs to whatever that marker announced.
      next_part_arr = definition_arr
      remaining_str = find_value_for("DEFINITION")

      while (md = remaining_str&.match(DEFINITION_SLICER_RX))
        next_part = md.pre_match
        next_part.sub!(/^\[:Ex(a|e)mple\]/, "Ex\\1mple")
        next_part_arr.push(next_part)
        next_part_arr = md[:example] ? @examples : @notes
        # 112-03-17
        # supplements the name of a quantity, especially for a component in a
        # system, to indicate the quotient of that quantity by the total
        # volume
        # <NOTE – Examples: amount-of-substance volume concentration of
        # component B (or concentration of B, in particular, ion
        # concentration), molecular concentration of B, electron concentration
        # (or electron density).
        #
        # In the above case the `Example` is part of the note but the regex
        # above will capture it as an example and will add an empty `Note`
        # and put the rest in an `Example`. So In this case we will replace
        # the `Example` with `[:Example]` and revert it in the next iteration
        # so it will not be caught by the regex.
        remaining_str = md.post_match
        remaining_str.sub!(/^Ex(a|e)mple/, "[:Ex\\1mple]") if md[:note]
      end

      remaining_str&.sub!(/^\[:Ex(a|e)mple\]/, "Ex\\1mple")
      next_part_arr.push(remaining_str)
      @definition = definition_arr.first
      @definition = nil if @definition&.empty?
    end

    # @return [Array<Hash>] primary designation, synonyms and international
    #   symbol, without nil entries
    def extract_terms
      [
        extract_primary_designation,
        *extract_synonymous_designations,
        extract_international_symbol_designation,
      ].compact
    end

    def extract_primary_designation
      raw_term = find_value_for("TERM")
      raw_term = "NA" if raw_term == "....."

      build_expression_designation(
        raw_term,
        attribute_data: find_value_for("TERMATTRIBUTE"),
        status: "preferred",
      )
    end

    # Builds designations from the SYNONYM1..SYNONYM3 columns together
    # with their respective ATTRIBUTE and STATUS columns.
    def extract_synonymous_designations
      designations = (1..3).flat_map do |num|
        raw_terms = find_value_for("SYNONYM#{num}") || ""
        attribute_data = find_value_for("SYNONYM#{num}ATTRIBUTE")
        status = find_value_for("SYNONYM#{num}STATUS")&.downcase

        # Some synonyms have more than one entry
        raw_terms.split(/<[pbr]+>/).map do |raw_term|
          build_expression_designation(
            raw_term,
            attribute_data: attribute_data,
            status: status,
          )
        end
      end

      designations.compact
    end

    def extract_international_symbol_designation
      raw_term = find_value_for("SYMBOLE")
      raw_term && build_symbol_designation(raw_term)
    end

    # @return [String, nil] converted definition, or nil when the term has
    #   no definition text
    def extract_definition_value
      convert_markup(@definition) if @definition
    end

    def extract_examples
      @examples.map { |str| convert_markup(str) }
    end

    def extract_notes
      @notes.map { |str| convert_markup(str) }
    end

    # @return [String, nil] "valid" for status "standard" (compared case
    #   insensitively); nil for any other or missing status
    def extract_entry_status
      # Safe navigation: a row without STATUS yields nil instead of raising.
      case find_value_for("STATUS")&.downcase
      when "standard" then "valid"
      end
    end

    # Normalizes the SYNONYM1STATUS column. Unrecognised values, including
    # nil, are returned unchanged.
    def extract_classification
      classification_val = find_value_for("SYNONYM1STATUS")

      case classification_val
      when "", "认可的", "допустимый", "admitido"
        "admitted"
      when "首选的", "suositettava", "suositeltava", "рекомендуемый", "preferente"
        "preferred"
      else
        classification_val
      end
    end

    # @return [Array<Hash>, nil] parsed sources, each tagged as
    #   authoritative; nil when the row has no SOURCE value
    def extract_authoritative_source
      source_val = find_value_for("SOURCE")
      return nil if source_val.nil?

      SourceParser.new(source_val, term_domain)
        .parsed_sources
        .compact
        .map { |source| source.merge("type" => "authoritative") }
    end

    # @return supersessions parsed from the REPLACES column; nil when the
    #   row has no REPLACES value
    def extract_superseded_concepts
      replaces_val = find_value_for("REPLACES")
      return nil if replaces_val.nil?

      SupersessionParser.new(replaces_val).supersessions
    end

    private

    # PUBLICATIONDATE column completed to a full date, see #flesh_date.
    # Evaluated anew on every call, hence every caller receives its own
    # string.
    def publication_date
      flesh_date(find_value_for("PUBLICATIONDATE"))
    end

    # Converts anchor and image markup, normalizes paragraph breaks, then
    # passes the text through IEV::Converter.mathml_to_asciimath and
    # strips surrounding whitespace.
    def convert_markup(str)
      IEV::Converter.mathml_to_asciimath(
        replace_newlines(parse_anchor_tag(str, term_domain)),
      ).strip
    end

    # Builds an expression designation hash. Entries whose value is nil
    # are dropped.
    def build_expression_designation(raw_term, attribute_data:, status:)
      term = IEV::Converter.mathml_to_asciimath(
        parse_anchor_tag(raw_term, term_domain),
      )
      term_attributes = TermAttrsParser.new(attribute_data.to_s)

      {
        "type" => "expression",
        "prefix" => term_attributes.prefix,
        "normative_status" => NORMATIVE_STATUSES[status] || status,
        "usage_info" => term_attributes.usage_info,
        "designation" => term,
        "part_of_speech" => term_attributes.part_of_speech,
        "geographical_area" => term_attributes.geographical_area,
        "gender" => term_attributes.gender,
        "plurality" => term_attributes.plurality,
      }.compact
    end

    # Builds a designation hash for an international symbol.
    def build_symbol_designation(raw_term)
      term = IEV::Converter.mathml_to_asciimath(
        parse_anchor_tag(raw_term, term_domain),
      )

      {
        "type" => "symbol",
        "designation" => term,
        "international" => true,
      }.compact
    end
  end
end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
module IEV
  # Helpers converting HTML-like markup found in term data into the
  # plain-text notation used in the output.
  module Utilities
    # Regexp source matching a <simg> tag; captures the file name which
    # follows "/$file/".
    SIMG_PATH_REGEX = "<simg .*\\/\\$file\\/([\\d\\-\\w\.]+)>"
    # Regexp source matching one bold "Figure N - title" caption; captures
    # the figure number, the title and an optional closing </p>.
    FIGURE_ONE_REGEX =
      "<p><b>\\s*Figure\\s+(\\d)\\s+[–-]\\s+(.+)\\s*<\\/b>(<\\/p>)?"
    # Regexp source matching two consecutive figure captions.
    FIGURE_TWO_REGEX = "#{FIGURE_ONE_REGEX}\\s*#{FIGURE_ONE_REGEX}"
    IMAGE_PATH_PREFIX = "image::/assets/images/parts"

    # Rewrites anchors, images, line breaks and bold markup in +text+.
    #
    # * anchors pointing to IEV references become
    #   <tt>{{link text, IEV:reference}}</tt>
    # * any other anchor becomes <tt>href[link text]</tt>
    # * <simg> tags followed by figure captions and <img> tags become
    #   image references below IMAGE_PATH_PREFIX and +term_domain+
    # * <tt><br></tt> becomes a newline, <tt><b>x</b></tt> becomes
    #   <tt>*x*</tt>
    #
    # @param text [String, nil] text to convert
    # @param term_domain [String] path segment inserted into image paths
    # @return [String, nil] converted copy of +text+; nil when +text+ is nil
    def parse_anchor_tag(text, term_domain)
      return unless text

      # Convert IEV term references
      # Convert href links
      # Need to take care of this pattern:
      # `inverse de la <a href="IEV103-06-01">période<a>`
      text.gsub(
        /<a href="?(IEV)\s*(\d\d\d-\d\d-\d\d\d?)"?>(.*?)<\/?a>/,
        '{{\3, \1:\2}}',
      ).gsub(
        # Reference without the "IEV" prefix. This pattern has only two
        # groups: \1 is the reference and \2 is the link text.
        /<a href="?\s*(\d\d\d-\d\d-\d\d\d?)"?>(.*?)<\/?a>/,
        '{{\2, IEV:\1}}',
      ).gsub(
        # To handle <a> tags without ending tag like
        # `Voir <a href=IEV103-05-21>IEV 103-05-21`
        # for concept '702-03-11' in `fr`
        /<a href="?(IEV)?\s*(\d\d\d-\d\d-\d\d\d?)"?>(.*?)$/,
        '{{\3, IEV:\2}}',
      ).gsub(
        /<a href="?([^<>]*?)"?>(.*?)<\/a>/,
        '\1[\2]',
      ).gsub(
        # Image followed by two captions; \6 is the second caption's title.
        Regexp.new([SIMG_PATH_REGEX, "\\s*", FIGURE_TWO_REGEX].join),
        "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[Figure \\2 - \\3; \\6]",
      ).gsub(
        Regexp.new([SIMG_PATH_REGEX, "\\s*", FIGURE_ONE_REGEX].join),
        "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[Figure \\2 - \\3]",
      ).gsub(
        /<img\s+([^<>]+?)\s*>/,
        "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[]",
      ).gsub(
        /<br>/,
        "\n",
      ).gsub(
        /<b>(.*?)<\/b>/,
        "*\\1*",
      )
    end

    # Turns paragraph markers into blank-line separated paragraphs.
    #
    # A literal backslash followed by "n" and tags made up of the letters
    # p, b and r (such as <tt><p></tt> or <tt><br></tt>) become paragraph
    # breaks. Whitespace runs containing at least two whitespace
    # characters, one of them a newline, are then squeezed into a single
    # paragraph break.
    #
    # @param input [String] text to normalize
    # @return [String] normalized copy without surrounding whitespace
    def replace_newlines(input)
      input.gsub('\n', "\n\n")
        .gsub(/<[pbr]+>/, "\n\n")
        .gsub(/\s*\n[\n\s]+/, "\n\n")
        .strip
    end
  end
end
data/lib/iev/version.rb CHANGED
@@ -1,3 +1,3 @@
1
- module Iev
2
- VERSION = "0.3.1".freeze
1
+ module IEV
2
+ VERSION = "0.3.3".freeze
3
3
  end
data/lib/iev.rb CHANGED
@@ -3,7 +3,27 @@ require "iev/db"
3
3
  require "open-uri"
4
4
  require "nokogiri"
5
5
 
6
- module Iev
6
+ require "benchmark"
7
+ require "creek"
8
+ require "unitsml"
9
+ require "plurimath"
10
+ require "glossarist"
11
+ require "relaton"
12
+ require "relaton_bib"
13
+ require "sequel"
14
+ require "thor"
15
+ require "yaml"
16
+ require "zeitwerk"
17
+
18
+ loader = Zeitwerk::Loader.for_gem
19
+ loader.inflector.inflect(
20
+ "cli" => "CLI",
21
+ "iev" => "IEV",
22
+ "ui" => "UI",
23
+ )
24
+ loader.setup
25
+
26
+ module IEV
7
27
  #
8
28
  # Scrape Electropedia for term.
9
29
  #
@@ -23,6 +43,8 @@ module Iev
23
43
  a = doc&.at(xpath)&.children&.to_xml
24
44
  a&.sub(%r{<br/>.*$}, "")
25
45
  &.sub(%r{, &lt;.*$}, "")
26
- &.gsub(%r{<[^>]*>}, "")&.strip
46
+ &.gsub(%r{<[^<>]*>}, "")&.strip
27
47
  end
28
48
  end
49
+
50
+ require "iev/cli"