iev 0.3.9 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/rake.yml +0 -2
- data/.github/workflows/release.yml +3 -1
- data/.gitignore +3 -1
- data/CLAUDE.md +50 -0
- data/Gemfile +3 -0
- data/README.adoc +65 -15
- data/exe/iev +11 -0
- data/iev.gemspec +5 -4
- data/lib/iev/cli/command.rb +119 -76
- data/lib/iev/cli/command_helper.rb +55 -36
- data/lib/iev/config.rb +31 -0
- data/lib/iev/converter/mathml_to_asciimath.rb +119 -158
- data/lib/iev/data_source.rb +124 -0
- data/lib/iev/exporter.rb +122 -0
- data/lib/iev/scraper/page_parser.rb +176 -0
- data/lib/iev/scraper.rb +135 -0
- data/lib/iev/source_parser.rb +31 -18
- data/lib/iev/supersession_parser.rb +9 -13
- data/lib/iev/term_attrs_parser.rb +21 -7
- data/lib/iev/term_builder.rb +100 -94
- data/lib/iev/utilities.rb +91 -42
- data/lib/iev/version.rb +1 -1
- data/lib/iev.rb +47 -35
- metadata +34 -13
- data/lib/iev/db.rb +0 -82
- data/lib/iev/db_cache.rb +0 -124
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Iev
  class Scraper
    # Parses an Electropedia HTML page into a concept data hash.
    #
    # The parser works on the table with the most <tr> elements and reads it
    # row by row:
    # - Language row: carries the marker
    #   <div align="center"><font color="#800080">en</font></div>
    #   and holds the term in its third <td> (inside <b> when present).
    # - Definition row: the row directly after a language row; its third <td>
    #   holds the definition markup. Only that single row is inspected.
    # - Separator rows: contain an <hr>, or only empty cells / "ecblank"
    #   spacer images. They never count as a definition.
    class PageParser
      # Map Electropedia HTML language codes to ISO 639-2/3 three-char codes.
      LANG_CODE_MAP = {
        "en" => "eng",
        "fr" => "fra",
        "ar" => "ara",
        "de" => "deu",
        "es" => "spa",
        "it" => "ita",
        "ko" => "kor",
        "ja" => "jpn",
        "pl" => "pol",
        "pt" => "por",
        "sr" => "srp",
        "sv" => "swe",
        "zh" => "zho",
        "nl" => "nld",
        "fi" => "fin",
        "cs" => "ces",
        "no" => "nor",
        "ru" => "rus",
        "sl" => "slv",
        "sk" => "slk",
      }.freeze

      # CSS selector of the <font> element that marks a language row.
      LANG_MARKER_SELECTOR = "div[align='center'] font[color='#800080']".freeze

      # Matches cell markup that begins with an "ecblank" spacer image.
      SPACER_IMG_RX = /\A<img.*ecblank/.freeze

      # @param doc  parsed HTML document; must respond to #css and #at_xpath
      # @param code IEV reference the page is expected to describe
      def initialize(doc, code)
        @doc = doc
        @code = code
      end

      # Builds the concept hash for the page.
      #
      # @return [Hash, nil] {"id" => code, "data" => {...}}, or nil when no
      #   <b> element on the page contains the IEV reference
      def parse
        return nil unless find_iev_ref

        {
          "id" => @code,
          "data" => {
            "identifier" => @code,
            "localized_concepts" => localized_concepts,
          },
        }
      end

      private

      # Finds a <b> element whose text contains the IEV reference, which
      # confirms the page is valid. The code is passed as a bound XPath
      # variable rather than interpolated, so a quote inside it cannot break
      # or alter the query.
      def find_iev_ref
        @doc.at_xpath("//b[contains(., $code)]", nil, { code: @code.to_s })
      end

      # Returns { lang => { "term" => ..., "definition" => ... } } for every
      # language row that yields a term. "definition" is omitted when the
      # row after the language row holds none.
      def localized_concepts
        lang_sections.each_with_object({}) do |(lang, term_row, def_row), concepts|
          term = extract_term(term_row)
          next unless term

          entry = { "term" => term }
          definition = extract_definition(def_row)
          entry["definition"] = definition if definition

          concepts[lang] = entry
        end
      end

      # Finds all language sections in the table.
      # Returns array of [lang_code, term_row, definition_row] tuples;
      # definition_row is nil when the following row is not a definition.
      # Rows whose language code is not in LANG_CODE_MAP are left out.
      def lang_sections
        rows = content_rows

        rows.each_with_index.each_with_object([]) do |(row, idx), sections|
          lang = extract_lang(row)
          next unless lang

          sections << [lang, row, find_definition_row(rows, idx + 1)]
        end
      end

      # Rows of the table that contains the most <tr> elements, or [] when
      # the document has no table.
      def content_rows
        largest = @doc.css("table").max_by { |table| table.css("tr").length }
        largest ? largest.css("tr").to_a : []
      end

      # The language marker element of a row, or nil for non-language rows.
      def language_marker(row)
        row.at_css(LANG_MARKER_SELECTOR)
      end

      # Three-char language code for a language row; nil when the row has no
      # marker or its code is not in LANG_CODE_MAP.
      def extract_lang(row)
        marker = language_marker(row)
        return nil unless marker

        LANG_CODE_MAP[marker.text.strip.downcase]
      end

      # Third <td> of a row, or nil when the row has fewer than three cells.
      def content_cell(row)
        cells = row.css("td")
        cells.length < 3 ? nil : cells[2]
      end

      # Term is in the third <td> — may be in a <b> tag (en, fr) or plain text.
      # Returns nil when the cell is missing or the text is blank.
      def extract_term(row)
        cell = content_cell(row)
        return nil unless cell

        bold = cell.at_css("b")
        term = (bold || cell).text.strip
        term.empty? ? nil : term
      end

      # Inner HTML of the definition cell (may include MathML markup), or nil
      # when there is no row, no third cell, or the cell is empty or starts
      # with a spacer image.
      #
      # NOTE(review): find_definition_row lets spacer-prefixed cells through
      # when they contain "<b>", but they are still rejected here — confirm
      # which of the two rules is intended.
      def extract_definition(row)
        return nil unless row

        cell = content_cell(row)
        return nil unless cell

        html = cell.inner_html.strip
        return nil if html.empty? || html.match?(SPACER_IMG_RX)

        html
      end

      # Returns rows[start_idx] when it qualifies as a definition row, nil
      # otherwise. Only that one row is examined; nothing is skipped over.
      # A row is rejected when it is itself a language row (mapped or not),
      # is a separator, has fewer than three cells, or has an empty or
      # spacer-only third cell.
      def find_definition_row(rows, start_idx)
        return nil if start_idx >= rows.length

        row = rows[start_idx]
        # Check the marker itself, not the mapped code: a row for a language
        # missing from LANG_CODE_MAP is still a term row, not a definition.
        return nil if language_marker(row)
        return nil if separator?(row)

        cell = content_cell(row)
        return nil unless cell

        content = cell.inner_html.strip
        return nil if content.empty?

        # Reject rows that are only spacer images (unless they have <b> content)
        return nil if content.match?(SPACER_IMG_RX) && !content.include?("<b>")

        row
      end

      # True when any cell holds an <hr>, or every cell is spacer-only.
      def separator?(row)
        cells = row.css("td")
        cells.any? { |td| td.at_css("hr") } ||
          cells.all? { |td| spacer_only?(td) }
      end

      # Truthy when the cell is empty, starts with an "ecblank" image, or
      # contains such an image and no text.
      def spacer_only?(cell)
        html = cell.inner_html.strip
        return true if html.empty? || html.match?(SPACER_IMG_RX)

        cell.at_css("img[src*='ecblank']") && cell.text.strip.empty?
      end
    end
  end
end
|
data/lib/iev/scraper.rb
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Iev
  # Scrapes IEV term data from Electropedia (electropedia.org).
  #
  # Electropedia is behind AWS WAF which requires JavaScript execution,
  # so a headless browser (via Ferrum/Chrome) is used to handle the challenge.
  #
  # @example
  #   scraper = Iev::Scraper.new
  #   concept = scraper.fetch_concept("103-01-02")
  #   doc = scraper.fetch_page("103-01-02")
  class Scraper
    BASE_URL = "https://www.electropedia.org/iev/iev.nsf/" \
               "display?openform&ievref="

    # Pool of realistic Chrome User-Agent strings with matching platform hints.
    # One profile is picked at random per request to reduce fingerprinting by
    # AWS WAF.
    USER_AGENT_PROFILES = [
      {
        user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/131.0.0.0 Safari/537.36",
        platform: '"macOS"',
        chrome_version: "131",
      },
      {
        user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/130.0.0.0 Safari/537.36",
        platform: '"Windows"',
        chrome_version: "130",
      },
      {
        user_agent: "Mozilla/5.0 (X11; Linux x86_64) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/131.0.0.0 Safari/537.36",
        platform: '"Linux"',
        chrome_version: "131",
      },
      {
        user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/129.0.0.0 Safari/537.36",
        platform: '"macOS"',
        chrome_version: "129",
      },
      {
        user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/131.0.0.0 Safari/537.36",
        platform: '"Windows"',
        chrome_version: "131",
      },
    ].freeze

    # Chrome flags that are always passed unless the caller overrides the
    # same key through browser_opts[:browser_options].
    DEFAULT_BROWSER_OPTIONS = {
      "disable-blink-features" => "AutomationControlled",
    }.freeze

    # Substrings of the response body that are treated as an AWS WAF block.
    WAF_BLOCK_MARKERS = ["403 ERROR", "Request blocked"].freeze

    # @param browser_opts [Hash] extra options for Ferrum::Browser.new; they
    #   override the defaults used here. A :browser_options entry is merged
    #   into DEFAULT_BROWSER_OPTIONS instead of replacing it.
    def initialize(browser_opts: {})
      @browser_opts = browser_opts
    end

    # Fetch the Electropedia page HTML for a given IEV code.
    #
    # A fresh browser is started for the request and always quit afterwards.
    #
    # @param code [String] IEV reference, e.g. "103-01-02"
    # @return a Nokogiri document, or nil when the body looks like a WAF
    #   block page or Ferrum raises (a warning is printed in both cases)
    def fetch_page(code)
      # Loaded lazily so the gems are only needed when scraping is used.
      require "ferrum"
      require "nokogiri"

      browser = Ferrum::Browser.new(**browser_settings)

      browser.headers.set(random_headers)
      browser.go_to(page_url(code))
      browser.network.wait_for_idle(timeout: 15)
      html = browser.body

      # Check if we got a real page or a WAF block
      if waf_blocked?(html)
        warn "IEV Scraper: AWS WAF blocked request for #{code}"
        return nil
      end

      Nokogiri::HTML(html)
    rescue Ferrum::Error, Ferrum::BrowserError => e
      warn "IEV Scraper error for #{code}: #{e.message}"
      nil
    ensure
      # browser is nil when the require or Browser.new itself failed.
      browser&.quit
    end

    # Fetch and parse concept data for an IEV code.
    # Returns a hash with concept data or nil if not found.
    def fetch_concept(code)
      doc = fetch_page(code)
      return nil unless doc

      PageParser.new(doc, code).parse
    end

    private

    # Full page URL for an IEV code. The code is form-encoded so characters
    # such as "&" or spaces cannot change the query string; plain codes like
    # "103-01-02" pass through unchanged.
    def page_url(code)
      require "uri"

      "#{BASE_URL}#{URI.encode_www_form_component(code.to_s)}"
    end

    # Options hash for Ferrum::Browser.new: the defaults, overridden by the
    # caller's browser_opts, with :browser_options merged key by key so a
    # caller adding one Chrome flag does not drop the default ones.
    def browser_settings
      overrides = @browser_opts.dup
      custom_flags = overrides.delete(:browser_options) || {}

      {
        headless: "new",
        timeout: 30,
        window_size: [1366, 768],
      }.merge(overrides).merge(
        browser_options: DEFAULT_BROWSER_OPTIONS.merge(custom_flags),
      )
    end

    # True when the body contains one of the WAF block markers.
    def waf_blocked?(html)
      WAF_BLOCK_MARKERS.any? { |marker| html.include?(marker) }
    end

    # Request headers built from one randomly sampled profile, so the
    # User-Agent, Sec-Ch-Ua version and Sec-Ch-Ua-Platform agree.
    def random_headers
      profile = USER_AGENT_PROFILES.sample
      version = profile[:chrome_version]
      sec_ch_ua = %("Google Chrome";v="#{version}", ) +
        %("Chromium";v="#{version}", ) +
        %("Not_A Brand";v="24")

      {
        "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
                    "image/avif,image/webp,image/apng,*/*;q=0.8," \
                    "application/signed-exchange;v=b3;q=0.7",
        "Accept-Language" => "en-GB,en-US;q=0.9,en;q=0.8",
        "Cache-Control" => "no-cache",
        "Pragma" => "no-cache",
        "Sec-Ch-Ua" => sec_ch_ua,
        "Sec-Ch-Ua-Mobile" => "?0",
        "Sec-Ch-Ua-Platform" => profile[:platform],
        "Sec-Fetch-Dest" => "document",
        "Sec-Fetch-Mode" => "navigate",
        "Sec-Fetch-Site" => "cross-site",
        "Sec-Fetch-User" => "?1",
        "Upgrade-Insecure-Requests" => "1",
        "User-Agent" => profile[:user_agent],
      }
    end
  end
end
|
|
134
|
+
|
|
135
|
+
require_relative "scraper/page_parser"
|
data/lib/iev/source_parser.rb
CHANGED
|
@@ -11,7 +11,6 @@ module Iev
|
|
|
11
11
|
# @example
|
|
12
12
|
# SourceParser.new(cell_data_string).parsed_sources
|
|
13
13
|
class SourceParser
|
|
14
|
-
include Cli::Ui
|
|
15
14
|
include Utilities
|
|
16
15
|
using DataConversions
|
|
17
16
|
|
|
@@ -71,20 +70,25 @@ module Iev
|
|
|
71
70
|
end
|
|
72
71
|
|
|
73
72
|
# Builds a Glossarist::ConceptSource for one raw source reference.
#
# The relationship (status / modification) is read from the raw string; the
# reference and clause are read from its normalized form. When a
# RelatonBib::RequestError is raised along the way, its message is printed
# with +warn+ and the result of +warn+ is returned instead of a source.
def extract_single_source(raw_ref)
  relationship = extract_source_relationship(raw_ref)
  normalized = normalize_ref_string(raw_ref)
  ref = extract_source_ref(normalized)
  locality = build_locality(extract_source_clause(normalized))
  link = obtain_source_link(ref)
  original = Iev::Converter.mathml_to_asciimath(
    parse_anchor_tag(raw_ref, @term_domain),
  )

  citation = Glossarist::Citation.new(
    ref: ref,
    locality: locality,
    link: link,
    original: original,
  )

  Glossarist::ConceptSource.new(
    status: relationship[:status],
    origin: citation,
    modification: relationship[:modification],
  )
rescue ::RelatonBib::RequestError => e
  warn e.message
end
|
|
@@ -208,7 +212,6 @@ module Iev
|
|
|
208
212
|
/Constitution de l’Union internationale des télécommunications (UIT)/
|
|
209
213
|
"International Telecommunication Union (ITU) Constitution (Ed. 2015)"
|
|
210
214
|
else
|
|
211
|
-
debug :sources, "Failed to parse source: '#{str}'"
|
|
212
215
|
str
|
|
213
216
|
end
|
|
214
217
|
end
|
|
@@ -320,26 +323,36 @@ module Iev
|
|
|
320
323
|
|
|
321
324
|
case str
|
|
322
325
|
when /^MOD ([\d\-])/
|
|
323
|
-
{
|
|
324
|
-
"type" => type.to_s,
|
|
325
|
-
}
|
|
326
|
+
{ status: type.to_s }
|
|
326
327
|
when /(modified|modifié|modifiée|modifiés|MOD)\s*[–-]?\s+(.+)\Z/
|
|
327
328
|
{
|
|
328
|
-
|
|
329
|
-
|
|
329
|
+
status: type.to_s,
|
|
330
|
+
modification: Iev::Converter.mathml_to_asciimath(
|
|
330
331
|
parse_anchor_tag(::Regexp.last_match(2), @term_domain),
|
|
331
332
|
).strip,
|
|
332
333
|
}
|
|
333
334
|
else
|
|
334
|
-
{
|
|
335
|
-
"type" => type.to_s,
|
|
336
|
-
}
|
|
335
|
+
{ status: type.to_s }
|
|
337
336
|
end
|
|
338
337
|
end
|
|
339
338
|
|
|
339
|
+
# Wraps +clause+ in a Glossarist::Locality of type "clause", using it as
# the locality's reference_from. Returns nil when +clause+ is nil or false.
def build_locality(clause)
  Glossarist::Locality.new(type: "clause", reference_from: clause) if clause
end
|
|
347
|
+
|
|
340
348
|
# Uses Relaton to obtain link for given source ref.
|
|
341
349
|
def obtain_source_link(ref)
  # Without RelatonDb defined there is nothing to look the reference up in.
  return unless defined?(RelatonDb)

  # fetch may return nil, in which case no link is available.
  entry = RelatonDb.instance.fetch(ref)
  entry&.url
rescue ::RelatonBib::RequestError => e
  # Report the failed lookup and carry on without a link.
  warn e.message
  nil
end
|
|
344
357
|
end
|
|
345
358
|
end
|
|
@@ -9,8 +9,8 @@ module Iev
|
|
|
9
9
|
#
|
|
10
10
|
# @example
|
|
11
11
|
# SupersessionParser.new(cell_data_string).supersessions
|
|
12
|
+
# # => [Glossarist::RelatedConcept, ...]
|
|
12
13
|
class SupersessionParser
|
|
13
|
-
include Cli::Ui
|
|
14
14
|
using DataConversions
|
|
15
15
|
|
|
16
16
|
attr_reader :raw_str, :src_str, :supersessions
|
|
@@ -52,18 +52,14 @@ module Iev
|
|
|
52
52
|
end
|
|
53
53
|
|
|
54
54
|
# Builds a "supersedes" Glossarist::RelatedConcept whose citation points at
# the IEV entry given by the :ref and :version captures of +match_data+.
def relation_from_match(match_data)
  superseded = Glossarist::Citation.new(
    source: "IEV",
    id: match_data[:ref],
    version: match_data[:version],
  )

  Glossarist::RelatedConcept.new(type: "supersedes", ref: superseded)
end
|
|
68
64
|
end
|
|
69
65
|
end
|
|
@@ -13,7 +13,6 @@ module Iev
|
|
|
13
13
|
# parser.plurality # returns grammatical plurality
|
|
14
14
|
# parser.part_of_speech # returns part of speech
|
|
15
15
|
class TermAttrsParser
|
|
16
|
-
include Cli::Ui
|
|
17
16
|
using DataConversions
|
|
18
17
|
|
|
19
18
|
attr_reader :raw_str, :src_str, :gender, :geographical_area,
|
|
@@ -44,6 +43,19 @@ module Iev
|
|
|
44
43
|
"<ATTRIBUTES: #{src_str}>".freeze
|
|
45
44
|
end
|
|
46
45
|
|
|
46
|
+
# Constructs a Glossarist::Designation::GrammarInfo from the parsed
|
|
47
|
+
# gender, plurality, and part_of_speech attributes.
|
|
48
|
+
# Returns nil if none of these attributes were parsed.
|
|
49
|
+
def to_grammar_info
  # Nothing grammatical was parsed, so there is no GrammarInfo to build.
  return nil if !gender && !plurality && !part_of_speech

  # gender and number are passed as one-element arrays, or nil when unset.
  genders = ([gender] if gender)
  numbers = ([plurality] if plurality)

  Glossarist::Designation::GrammarInfo.new(
    gender: genders,
    number: numbers,
    part_of_speech: part_of_speech,
  )
end
|
|
58
|
+
|
|
47
59
|
private
|
|
48
60
|
|
|
49
61
|
def parse
|
|
@@ -58,10 +70,7 @@ module Iev
|
|
|
58
70
|
|
|
59
71
|
return unless /\p{Word}/.match?(curr_str)
|
|
60
72
|
|
|
61
|
-
|
|
62
|
-
:term_attributes,
|
|
63
|
-
"Term attributes could not be parsed completely: '#{src_str}'",
|
|
64
|
-
)
|
|
73
|
+
# Term attributes could not be parsed completely
|
|
65
74
|
end
|
|
66
75
|
|
|
67
76
|
def extract_gender(str)
|
|
@@ -130,11 +139,16 @@ module Iev
|
|
|
130
139
|
\b
|
|
131
140
|
/x
|
|
132
141
|
|
|
133
|
-
|
|
142
|
+
removed = remove_from_string(str, prefix_rx)
|
|
143
|
+
@prefix = removed if removed
|
|
134
144
|
end
|
|
135
145
|
|
|
136
146
|
# Decodes HTML in +str+ (empty string when decoding yields nil) and removes
# a fixed set of inline tags from the result.
def decode_attrs_string(str)
  # Only attribute-less opening/closing forms of these tags are removed, so
  # angle-bracket usage_info such as <telecommunications> is left for the
  # later parsing steps.
  text = str.decode_html || ""
  text.gsub(/<\/?(?:sup|sub|i|b|em|strong|span|small)>/, "")
end
|
|
139
153
|
|
|
140
154
|
def remove_from_string(string, regexp)
|