iev 0.3.9 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/rake.yml +1 -3
- data/.github/workflows/release.yml +3 -1
- data/.gitignore +3 -1
- data/CLAUDE.md +50 -0
- data/Gemfile +3 -0
- data/README.adoc +65 -15
- data/exe/iev +11 -0
- data/iev.gemspec +5 -4
- data/lib/iev/cli/command.rb +122 -76
- data/lib/iev/cli/command_helper.rb +55 -36
- data/lib/iev/config.rb +31 -0
- data/lib/iev/converter/mathml_to_asciimath.rb +137 -159
- data/lib/iev/data_source.rb +124 -0
- data/lib/iev/exporter.rb +138 -0
- data/lib/iev/scraper/page_parser.rb +176 -0
- data/lib/iev/scraper.rb +135 -0
- data/lib/iev/source_parser.rb +39 -19
- data/lib/iev/supersession_parser.rb +9 -13
- data/lib/iev/term_attrs_parser.rb +21 -7
- data/lib/iev/term_builder.rb +102 -94
- data/lib/iev/utilities.rb +129 -42
- data/lib/iev/version.rb +1 -1
- data/lib/iev.rb +47 -35
- metadata +34 -13
- data/lib/iev/db.rb +0 -82
- data/lib/iev/db_cache.rb +0 -124
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Iev
  class Scraper
    # Parses an Electropedia HTML page into a concept data hash.
    #
    # The Electropedia HTML structure is a table with rows for each language:
    # - Language row: <div align="center"><font color="#800080">en</font></div>
    # - Term cell: <b>term text</b> in the third <td>
    # - Definition row: the row immediately after the language row, third <td>
    #   (only when that row is neither a language row nor a separator)
    # - Empty/separator rows with <hr> or spacer images
    #
    # The document only needs to respond to the Nokogiri node API used here
    # (+css+, +at_css+, +text+, +inner_html+).
    class PageParser
      # Map Electropedia HTML language codes to ISO 639-2/3 three-char codes.
      LANG_CODE_MAP = {
        "en" => "eng",
        "fr" => "fra",
        "ar" => "ara",
        "de" => "deu",
        "es" => "spa",
        "it" => "ita",
        "ko" => "kor",
        "ja" => "jpn",
        "pl" => "pol",
        "pt" => "por",
        "sr" => "srp",
        "sv" => "swe",
        "zh" => "zho",
        "nl" => "nld",
        "fi" => "fin",
        "cs" => "ces",
        "no" => "nor",
        "ru" => "rus",
        "sl" => "slv",
        "sk" => "slk",
      }.freeze

      # Selector for the element that carries a row's language code.
      LANG_MARKER_SELECTOR = "div[align='center'] font[color='#800080']"

      # Zero-based index of the <td> that holds the term / definition content.
      CONTENT_CELL_INDEX = 2

      # Matches cell HTML that begins with the "ecblank" spacer image.
      SPACER_IMAGE_RX = /\A<img.*ecblank/

      # @param doc  parsed HTML document (Nokogiri-like node)
      # @param code IEV reference the page is expected to describe
      def initialize(doc, code)
        @doc = doc
        @code = code
      end

      # Builds the concept hash for the page.
      #
      # @return [Hash, nil] hash with "id" and "data" keys, or nil when the
      #   IEV reference does not appear in any <b> element of the page
      def parse
        return nil unless find_iev_ref

        {
          "id" => @code,
          "data" => {
            "identifier" => @code,
            "localized_concepts" => localized_concepts,
          },
        }
      end

      private

      # Finds a <b> element whose text contains the IEV reference, which
      # confirms that the page is valid.
      #
      # The comparison is done in Ruby rather than by interpolating the code
      # into a CSS/XPath selector, so a code containing quote characters
      # cannot break or alter the query.
      def find_iev_ref
        needle = @code.to_s
        @doc.css("b").find { |bold| bold.text.include?(needle) }
      end

      # Builds { lang => { "term" => ..., "definition" => ... } }.
      # Languages without a term are left out; "definition" is only set
      # when a definition was found.
      def localized_concepts
        lang_sections.each_with_object({}) do |(lang, term_row, def_row), result|
          term = extract_term(term_row)
          next unless term

          entry = { "term" => term }
          definition = extract_definition(def_row)
          entry["definition"] = definition if definition

          result[lang] = entry
        end
      end

      # Finds all language sections in the table.
      # Returns array of [lang_code, term_row, definition_row] tuples;
      # definition_row is nil when the following row is not a definition.
      # Rows whose language marker is not in LANG_CODE_MAP are skipped.
      def lang_sections
        rows = content_rows

        rows.each_with_index.filter_map do |row, idx|
          lang = extract_lang(row)
          next unless lang

          [lang, row, find_definition_row(rows, idx + 1)]
        end
      end

      # Rows of the table with the most <tr> descendants, which is taken to
      # be the main content table. Returns [] when the page has no table.
      def content_rows
        @doc.css("table").map { |table| table.css("tr").to_a }.max_by(&:length) || []
      end

      # Three-letter language code for a language row, or nil when the row
      # has no language marker or the marker is not in LANG_CODE_MAP.
      def extract_lang(row)
        marker = lang_marker(row)
        marker && LANG_CODE_MAP[marker]
      end

      # Raw (stripped, lower-cased) language marker of a row, or nil when the
      # row carries no marker at all.
      def lang_marker(row)
        row.at_css(LANG_MARKER_SELECTOR)&.text&.strip&.downcase
      end

      # The <td> holding the term / definition content, or nil when the row
      # has fewer cells than that.
      def content_cell(row)
        row.css("td")[CONTENT_CELL_INDEX]
      end

      # Term is in the third <td> — may be in a <b> tag (en, fr) or plain text.
      # Returns nil when the cell is missing or its text is blank.
      def extract_term(row)
        cell = content_cell(row)
        return nil unless cell

        term = (cell.at_css("b") || cell).text.strip
        term.empty? ? nil : term
      end

      # Inner HTML of the definition cell (may include MathML markup), or nil
      # when there is no row, no third cell, or only blank / spacer content.
      def extract_definition(row)
        cell = row && content_cell(row)
        return nil unless cell

        html = cell.inner_html.strip
        return nil if html.empty? || html.match?(SPACER_IMAGE_RX)

        html
      end

      # Returns the row at +start_idx+ when it can hold a definition, else nil.
      #
      # Only that single row is inspected; nothing is skipped over. The row
      # is rejected when it is past the end of +rows+, is itself a language
      # row, is a separator, or has no usable third cell.
      #
      # Language rows are recognised by the presence of a language marker,
      # not by whether the marker is in LANG_CODE_MAP. Otherwise the term row
      # of an unmapped language would be mistaken for the previous
      # language's definition.
      def find_definition_row(rows, start_idx)
        row = rows[start_idx]
        return nil unless row
        return nil if lang_marker(row) || separator?(row)

        content = content_cell(row)&.inner_html&.strip
        return nil if content.nil? || content.empty?

        # Reject rows that are only spacer images (unless they have <b> content)
        return nil if content.match?(SPACER_IMAGE_RX) && !content.include?("<b>")

        row
      end

      # True when any cell contains an <hr>, or every cell is blank/spacer.
      def separator?(row)
        cells = row.css("td")
        cells.any? { |td| td.at_css("hr") } || cells.all? { |td| spacer_only?(td) }
      end

      # True when the cell is empty, starts with the spacer image, or holds a
      # spacer image and no visible text.
      def spacer_only?(cell)
        html = cell.inner_html.strip
        return true if html.empty? || html.match?(SPACER_IMAGE_RX)

        !cell.at_css("img[src*='ecblank']").nil? && cell.text.strip.empty?
      end
    end
  end
end
|
data/lib/iev/scraper.rb
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Iev
  # Scrapes IEV term data from Electropedia (electropedia.org).
  #
  # Electropedia is behind AWS WAF which requires JavaScript execution,
  # so a headless browser (via Ferrum/Chrome) is used to handle the challenge.
  # Ferrum and Nokogiri are required lazily, on the first page fetch.
  #
  # @example
  #   scraper = Iev::Scraper.new
  #   concept = scraper.fetch_concept("103-01-02")
  #   doc = scraper.fetch_page("103-01-02")
  class Scraper
    BASE_URL = "https://www.electropedia.org/iev/iev.nsf/" \
               "display?openform&ievref="

    # Pool of realistic Chrome User-Agent strings with matching platform hints.
    # Rotated per request to reduce fingerprinting by AWS WAF.
    USER_AGENT_PROFILES = [
      {
        user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/131.0.0.0 Safari/537.36",
        platform: '"macOS"',
        chrome_version: "131",
      },
      {
        user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/130.0.0.0 Safari/537.36",
        platform: '"Windows"',
        chrome_version: "130",
      },
      {
        user_agent: "Mozilla/5.0 (X11; Linux x86_64) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/131.0.0.0 Safari/537.36",
        platform: '"Linux"',
        chrome_version: "131",
      },
      {
        user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/129.0.0.0 Safari/537.36",
        platform: '"macOS"',
        chrome_version: "129",
      },
      {
        user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/131.0.0.0 Safari/537.36",
        platform: '"Windows"',
        chrome_version: "131",
      },
    ].freeze

    # Chrome switches that are always passed unless the caller overrides the
    # same key through browser_opts[:browser_options].
    DEFAULT_BROWSER_FLAGS = {
      "disable-blink-features" => "AutomationControlled",
    }.freeze

    # Substrings whose presence in the page body is treated as a WAF block.
    WAF_BLOCK_MARKERS = ["403 ERROR", "Request blocked"].freeze

    # @param browser_opts [Hash] extra keyword options for Ferrum::Browser.new;
    #   they override the defaults used by #fetch_page
    def initialize(browser_opts: {})
      @browser_opts = browser_opts
    end

    # Fetch the Electropedia page HTML for a given IEV code.
    #
    # A fresh browser is started for the request and always quit afterwards.
    #
    # @param code [String] IEV reference, e.g. "103-01-02"
    # @return [Nokogiri::HTML::Document, nil] the parsed page, or nil when the
    #   body looks like a WAF block page or a Ferrum error was raised (a
    #   warning is printed in both cases)
    def fetch_page(code)
      require "ferrum"
      require "nokogiri"

      browser = Ferrum::Browser.new(**browser_settings)

      browser.headers.set(random_headers)
      browser.go_to(page_url(code))
      browser.network.wait_for_idle(timeout: 15)
      html = browser.body

      # Check if we got a real page or a WAF block
      if waf_blocked?(html)
        warn "IEV Scraper: AWS WAF blocked request for #{code}"
        return nil
      end

      Nokogiri::HTML(html)
    rescue Ferrum::Error => e
      # Ferrum::BrowserError is a Ferrum::Error subclass, so it is covered here.
      warn "IEV Scraper error for #{code}: #{e.message}"
      nil
    ensure
      browser&.quit
    end

    # Fetch and parse concept data for an IEV code.
    # Returns a hash with concept data or nil if not found.
    def fetch_concept(code)
      doc = fetch_page(code)
      return nil unless doc

      PageParser.new(doc, code).parse
    end

    private

    # Full page URL for +code+. The code is escaped so characters such as
    # "&" or "=" cannot inject extra query parameters.
    def page_url(code)
      require "uri"

      "#{BASE_URL}#{URI.encode_www_form_component(code.to_s)}"
    end

    # Keyword options for Ferrum::Browser.new: defaults overridden by the
    # caller's browser_opts. The :browser_options hashes are merged key by
    # key, so passing an extra Chrome switch does not drop the default
    # anti-automation flag.
    def browser_settings
      overrides = @browser_opts.to_h
      flags = DEFAULT_BROWSER_FLAGS.merge(overrides[:browser_options].to_h)

      {
        headless: "new",
        timeout: 30,
        window_size: [1366, 768],
        **overrides,
        browser_options: flags,
      }
    end

    # True when the body contains one of the known WAF block markers.
    def waf_blocked?(html)
      WAF_BLOCK_MARKERS.any? { |marker| html.include?(marker) }
    end

    # Request headers for one randomly chosen entry of USER_AGENT_PROFILES,
    # so User-Agent, Sec-Ch-Ua and Sec-Ch-Ua-Platform agree with each other.
    def random_headers
      profile = USER_AGENT_PROFILES.sample
      version = profile[:chrome_version]
      sec_ch_ua = %("Google Chrome";v="#{version}", ) +
        %("Chromium";v="#{version}", ) +
        %("Not_A Brand";v="24")

      {
        "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
                    "image/avif,image/webp,image/apng,*/*;q=0.8," \
                    "application/signed-exchange;v=b3;q=0.7",
        "Accept-Language" => "en-GB,en-US;q=0.9,en;q=0.8",
        "Cache-Control" => "no-cache",
        "Pragma" => "no-cache",
        "Sec-Ch-Ua" => sec_ch_ua,
        "Sec-Ch-Ua-Mobile" => "?0",
        "Sec-Ch-Ua-Platform" => profile[:platform],
        "Sec-Fetch-Dest" => "document",
        "Sec-Fetch-Mode" => "navigate",
        "Sec-Fetch-Site" => "cross-site",
        "Sec-Fetch-User" => "?1",
        "Upgrade-Insecure-Requests" => "1",
        "User-Agent" => profile[:user_agent],
      }
    end
  end
end
|
|
134
|
+
|
|
135
|
+
require_relative "scraper/page_parser"
|
data/lib/iev/source_parser.rb
CHANGED
|
@@ -11,10 +11,15 @@ module Iev
|
|
|
11
11
|
# @example
|
|
12
12
|
# SourceParser.new(cell_data_string).parsed_sources
|
|
13
13
|
class SourceParser
|
|
14
|
-
include Cli::Ui
|
|
15
14
|
include Utilities
|
|
16
15
|
using DataConversions
|
|
17
16
|
|
|
17
|
+
# Class-level switch, enabled by default. When set to false,
# obtain_source_link returns nil without making Relaton network calls.
singleton_class.attr_accessor :relaton_enabled
self.relaton_enabled = true
|
|
22
|
+
|
|
18
23
|
attr_reader :src_split, :parsed_sources, :raw_str, :src_str
|
|
19
24
|
|
|
20
25
|
def initialize(source_str, term_domain)
|
|
@@ -71,21 +76,26 @@ module Iev
|
|
|
71
76
|
end
|
|
72
77
|
|
|
73
78
|
# Builds a Glossarist::ConceptSource from one raw source reference.
#
# The relationship (status / modification) is extracted from the raw
# string; ref and clause are extracted from its normalized form. The
# citation keeps the raw reference, run through parse_anchor_tag and the
# MathML-to-AsciiMath converter, as +original+.
#
# @param raw_ref [String] a single source reference as found in the data
# @return [Glossarist::ConceptSource, nil] nil (after printing a warning)
#   when a Relaton request error or a socket error is raised
def extract_single_source(raw_ref)
  relationship = extract_source_relationship(raw_ref)
  clean_ref = normalize_ref_string(raw_ref)
  source_ref = extract_source_ref(clean_ref)
  clause = extract_source_clause(clean_ref)

  origin = Glossarist::Citation.new(
    ref: source_ref,
    locality: build_locality(clause),
    link: obtain_source_link(source_ref),
    original: Iev::Converter.mathml_to_asciimath(
      parse_anchor_tag(raw_ref, @term_domain),
    ),
  )

  Glossarist::ConceptSource.new(
    status: relationship[:status],
    origin: origin,
    modification: relationship[:modification],
  )
rescue ::RelatonBib::RequestError, SocketError => e
  # Socket::ResolutionError is deliberately not listed: it is a SocketError
  # subclass and only exists on Ruby >= 3.3, so naming it would raise
  # NameError while matching the exception on older Rubies.
  warn e.message
  nil
end
|
|
91
101
|
|
|
@@ -208,7 +218,6 @@ module Iev
|
|
|
208
218
|
/Constitution de l’Union internationale des télécommunications (UIT)/
|
|
209
219
|
"International Telecommunication Union (ITU) Constitution (Ed. 2015)"
|
|
210
220
|
else
|
|
211
|
-
debug :sources, "Failed to parse source: '#{str}'"
|
|
212
221
|
str
|
|
213
222
|
end
|
|
214
223
|
end
|
|
@@ -320,26 +329,37 @@ module Iev
|
|
|
320
329
|
|
|
321
330
|
case str
|
|
322
331
|
when /^MOD ([\d\-])/
|
|
323
|
-
{
|
|
324
|
-
"type" => type.to_s,
|
|
325
|
-
}
|
|
332
|
+
{ status: type.to_s }
|
|
326
333
|
when /(modified|modifié|modifiée|modifiés|MOD)\s*[–-]?\s+(.+)\Z/
|
|
327
334
|
{
|
|
328
|
-
|
|
329
|
-
|
|
335
|
+
status: type.to_s,
|
|
336
|
+
modification: Iev::Converter.mathml_to_asciimath(
|
|
330
337
|
parse_anchor_tag(::Regexp.last_match(2), @term_domain),
|
|
331
338
|
).strip,
|
|
332
339
|
}
|
|
333
340
|
else
|
|
334
|
-
{
|
|
335
|
-
"type" => type.to_s,
|
|
336
|
-
}
|
|
341
|
+
{ status: type.to_s }
|
|
337
342
|
end
|
|
338
343
|
end
|
|
339
344
|
|
|
345
|
+
# Wraps a clause reference in a Glossarist::Locality of type "clause".
# Returns nil when no clause is given.
def build_locality(clause)
  Glossarist::Locality.new(type: "clause", reference_from: clause) if clause
end
|
|
353
|
+
|
|
340
354
|
# Uses Relaton to obtain link for given source ref.
#
# Returns nil without any lookup when the class-level relaton_enabled
# switch is off or RelatonDb is not defined. Also returns nil (after
# printing a warning) when a Relaton request error or a socket error is
# raised, and when the fetched entry is nil.
def obtain_source_link(ref)
  return nil unless self.class.relaton_enabled && defined?(RelatonDb)

  RelatonDb.instance.fetch(ref)&.url
rescue ::RelatonBib::RequestError, SocketError => e
  # Socket::ResolutionError is deliberately not listed: it is a SocketError
  # subclass and only exists on Ruby >= 3.3, so naming it would raise
  # NameError while matching the exception on older Rubies.
  warn e.message
  nil
end
|
|
344
364
|
end
|
|
345
365
|
end
|
|
@@ -9,8 +9,8 @@ module Iev
|
|
|
9
9
|
#
|
|
10
10
|
# @example
|
|
11
11
|
# SupersessionParser.new(cell_data_string).supersessions
|
|
12
|
+
# # => [Glossarist::RelatedConcept, ...]
|
|
12
13
|
class SupersessionParser
|
|
13
|
-
include Cli::Ui
|
|
14
14
|
using DataConversions
|
|
15
15
|
|
|
16
16
|
attr_reader :raw_str, :src_str, :supersessions
|
|
@@ -52,18 +52,14 @@ module Iev
|
|
|
52
52
|
end
|
|
53
53
|
|
|
54
54
|
# Converts one regexp match into a "supersedes" relation.
#
# The +ref+ and +version+ entries of +match_data+ become the id and
# version of an IEV citation, which is wrapped in a
# Glossarist::RelatedConcept.
def relation_from_match(match_data)
  superseded = Glossarist::Citation.new(
    source: "IEV",
    id: match_data[:ref],
    version: match_data[:version],
  )

  Glossarist::RelatedConcept.new(type: "supersedes", ref: superseded)
end
|
|
68
64
|
end
|
|
69
65
|
end
|
|
@@ -13,7 +13,6 @@ module Iev
|
|
|
13
13
|
# parser.plurality # returns grammatical plurality
|
|
14
14
|
# parser.part_of_speech # returns part of speech
|
|
15
15
|
class TermAttrsParser
|
|
16
|
-
include Cli::Ui
|
|
17
16
|
using DataConversions
|
|
18
17
|
|
|
19
18
|
attr_reader :raw_str, :src_str, :gender, :geographical_area,
|
|
@@ -44,6 +43,19 @@ module Iev
|
|
|
44
43
|
"<ATTRIBUTES: #{src_str}>".freeze
|
|
45
44
|
end
|
|
46
45
|
|
|
46
|
+
# Packs the parsed grammatical attributes into a
# Glossarist::Designation::GrammarInfo.
#
# gender and plurality are each passed as a one-element array (nil when
# absent), the latter under the +number+ key; part_of_speech is passed
# through unchanged. Returns nil when all three attributes are absent.
def to_grammar_info
  return nil if [gender, plurality, part_of_speech].none?

  wrap = ->(value) { value ? [value] : nil }

  Glossarist::Designation::GrammarInfo.new(
    gender: wrap.call(gender),
    number: wrap.call(plurality),
    part_of_speech: part_of_speech,
  )
end
|
|
58
|
+
|
|
47
59
|
private
|
|
48
60
|
|
|
49
61
|
def parse
|
|
@@ -58,10 +70,7 @@ module Iev
|
|
|
58
70
|
|
|
59
71
|
return unless /\p{Word}/.match?(curr_str)
|
|
60
72
|
|
|
61
|
-
|
|
62
|
-
:term_attributes,
|
|
63
|
-
"Term attributes could not be parsed completely: '#{src_str}'",
|
|
64
|
-
)
|
|
73
|
+
# Term attributes could not be parsed completely
|
|
65
74
|
end
|
|
66
75
|
|
|
67
76
|
def extract_gender(str)
|
|
@@ -130,11 +139,16 @@ module Iev
|
|
|
130
139
|
\b
|
|
131
140
|
/x
|
|
132
141
|
|
|
133
|
-
|
|
142
|
+
removed = remove_from_string(str, prefix_rx)
|
|
143
|
+
@prefix = removed if removed
|
|
134
144
|
end
|
|
135
145
|
|
|
136
146
|
# Decodes HTML entities in the attributes string (an empty string is used
# when decoding yields nil) and removes a fixed set of inline HTML tags.
#
# Only bare opening/closing forms of the listed tags are removed, so that
# angle-bracketed usage_info such as <telecommunications> is left for the
# later parsing steps.
def decode_attrs_string(str)
  (str.decode_html || "").gsub(/<\/?(?:sup|sub|i|b|em|strong|span|small)>/, "")
end
|
|
139
153
|
|
|
140
154
|
def remove_from_string(string, regexp)
|