iev 0.3.9 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,176 @@
1
# frozen_string_literal: true

module Iev
  class Scraper
    # Parses an Electropedia HTML page into a concept data hash.
    #
    # The Electropedia HTML structure is a table with rows for each language:
    # - Language row: <div align="center"><font color="#800080">en</font></div>
    # - Term cell: <b>term text</b> in the third <td>
    # - Definition row: next row's third <td> (if present)
    # - Empty/separator rows with <hr> or spacer images
    class PageParser
      # Map Electropedia HTML language codes to ISO 639-2/3 three-char codes.
      LANG_CODE_MAP = {
        "en" => "eng",
        "fr" => "fra",
        "ar" => "ara",
        "de" => "deu",
        "es" => "spa",
        "it" => "ita",
        "ko" => "kor",
        "ja" => "jpn",
        "pl" => "pol",
        "pt" => "por",
        "sr" => "srp",
        "sv" => "swe",
        "zh" => "zho",
        "nl" => "nld",
        "fi" => "fin",
        "cs" => "ces",
        "no" => "nor",
        "ru" => "rus",
        "sl" => "slv",
        "sk" => "slk",
      }.freeze

      # @param doc [Nokogiri::HTML::Document] parsed Electropedia page
      # @param code [String] IEV reference, e.g. "103-01-02"
      def initialize(doc, code)
        @doc = doc
        @code = code
      end

      # Builds the concept hash for the page, or returns nil when the page
      # does not contain the requested IEV reference.
      #
      # @return [Hash, nil] {"id" => code, "data" => {...}} or nil
      def parse
        return nil unless find_iev_ref

        {
          "id" => @code,
          "data" => {
            "identifier" => @code,
            "localized_concepts" => localized_concepts,
          },
        }
      end

      private

      # Locates the IEV reference cell to confirm the page is a valid
      # term page (bold cell containing the reference string).
      def find_iev_ref
        @doc.at_css("b:contains('#{@code}')") ||
          @doc.at_xpath("//td/b[contains(text(), '#{@code}')]")
      end

      # Collects per-language entries:
      # {lang => {"term" => ..., "definition" => ...}}.
      # Languages whose term cell is missing or blank are dropped.
      def localized_concepts
        lang_sections.each_with_object({}) do |(lang, term_row, def_row), acc|
          term = extract_term(term_row)
          next unless term

          entry = { "term" => term }
          definition = extract_definition(def_row)
          entry["definition"] = definition if definition

          acc[lang] = entry
        end
      end

      # Finds all language sections in the table.
      # Returns an array of [lang_code, term_row, definition_row] tuples;
      # definition_row may be nil when no usable follow-up row exists.
      def lang_sections
        rows = content_rows

        rows.each_with_index.filter_map do |row, idx|
          lang = extract_lang(row)
          # The definition is in the next non-empty, non-separator row.
          [lang, row, find_definition_row(rows, idx + 1)] if lang
        end
      end

      # The main content table (the one carrying IEV language data) is
      # assumed to be the table with the most rows on the page.
      def content_rows
        main_table = @doc.css("table").max_by { |table| table.css("tr").length }
        main_table ? main_table.css("tr").to_a : []
      end

      # Maps a row's purple centered language marker to a three-letter
      # code via LANG_CODE_MAP; nil when the row is not a language row.
      def extract_lang(row)
        marker = row.at_css("div[align='center'] font[color='#800080']")
        marker && LANG_CODE_MAP[marker.text.strip.downcase]
      end

      # Term is in the third <td> — may be in a <b> tag (en, fr) or plain
      # text. Returns nil when the cell is absent or blank.
      def extract_term(row)
        cells = row.css("td")
        return nil if cells.length < 3

        holder = cells[2].at_css("b") || cells[2]
        text = holder.text.strip
        text.empty? ? nil : text
      end

      # Returns the definition cell's inner HTML (which may include
      # MathML), or nil for missing rows and spacer-only cells.
      def extract_definition(row)
        return nil unless row

        cells = row.css("td")
        return nil if cells.length < 3

        body = cells[2].inner_html.strip
        return nil if body.empty? || body.match?(/\A<img.*ecblank/)

        body
      end

      # Find the definition row following a language row.
      # Skips separator rows (empty, <hr>, or spacer images) and stops at
      # the next language row.
      def find_definition_row(rows, start_idx)
        candidate = rows[start_idx]
        return nil if candidate.nil?
        return nil if extract_lang(candidate) || separator?(candidate)

        cells = candidate.css("td")
        return nil if cells.length < 3

        body = cells[2].inner_html.strip
        return nil if body.empty?

        # Rows that are only spacer images don't count (unless they carry
        # <b> content).
        return nil if body.match?(/\A<img.*ecblank/) && !body.include?("<b>")

        candidate
      end

      # A separator row contains an <hr> anywhere, or nothing but spacer
      # cells.
      def separator?(row)
        cells = row.css("td")
        return true if cells.any? { |cell| cell.at_css("hr") }

        cells.all? { |cell| spacer_only?(cell) }
      end

      # True when the cell holds no visible content: empty markup, or only
      # an "ecblank" spacer image with no text.
      def spacer_only?(cell)
        markup = cell.inner_html.strip
        return true if markup.empty?
        return true if markup.match?(/\A<img.*ecblank/)

        cell.at_css("img[src*='ecblank']") && cell.text.strip.empty?
      end
    end
  end
end
@@ -0,0 +1,135 @@
1
# frozen_string_literal: true

module Iev
  # Scrapes IEV term data from Electropedia (electropedia.org).
  #
  # Electropedia is behind AWS WAF which requires JavaScript execution,
  # so a headless browser (via Ferrum/Chrome) is used to handle the challenge.
  #
  # @example
  #   scraper = Iev::Scraper.new
  #   concept = scraper.fetch_concept("103-01-02")
  #   doc = scraper.fetch_page("103-01-02")
  class Scraper
    BASE_URL = "https://www.electropedia.org/iev/iev.nsf/" \
               "display?openform&ievref="

    # Pool of realistic Chrome User-Agent strings with matching platform hints.
    # Rotated per request to reduce fingerprinting by AWS WAF.
    USER_AGENT_PROFILES = [
      {
        user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/131.0.0.0 Safari/537.36",
        platform: '"macOS"',
        chrome_version: "131",
      },
      {
        user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/130.0.0.0 Safari/537.36",
        platform: '"Windows"',
        chrome_version: "130",
      },
      {
        user_agent: "Mozilla/5.0 (X11; Linux x86_64) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/131.0.0.0 Safari/537.36",
        platform: '"Linux"',
        chrome_version: "131",
      },
      {
        user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/129.0.0.0 Safari/537.36",
        platform: '"macOS"',
        chrome_version: "129",
      },
      {
        user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/131.0.0.0 Safari/537.36",
        platform: '"Windows"',
        chrome_version: "131",
      },
    ].freeze

    # @param browser_opts [Hash] extra options merged into Ferrum::Browser.new
    #   (may override the defaults set by fetch_page).
    def initialize(browser_opts: {})
      @browser_opts = browser_opts
    end

    # Fetch the Electropedia page HTML for a given IEV code.
    # Returns a Nokogiri document, or nil when the request is blocked by
    # the WAF, the browser fails, or the ferrum/nokogiri gems are missing.
    def fetch_page(code)
      require "ferrum"
      require "nokogiri"

      url = "#{BASE_URL}#{code}"
      browser = Ferrum::Browser.new(
        headless: "new",
        timeout: 30,
        window_size: [1366, 768],
        browser_options: {
          "disable-blink-features" => "AutomationControlled",
        },
        **@browser_opts,
      )

      browser.headers.set(random_headers)
      browser.go_to(url)
      browser.network.wait_for_idle(timeout: 15)
      html = browser.body

      # Check if we got a real page or a WAF block
      if html.include?("403 ERROR") || html.include?("Request blocked")
        warn "IEV Scraper: AWS WAF blocked request for #{code}"
        return nil
      end

      Nokogiri::HTML(html)
    rescue LoadError => e
      # Fix: LoadError must be rescued BEFORE the Ferrum clause. When the
      # ferrum gem is absent, evaluating `Ferrum::Error` in the rescue
      # clause below would itself raise NameError and mask the real cause.
      warn "IEV Scraper: missing dependency for #{code}: #{e.message}"
      nil
    rescue Ferrum::Error, Ferrum::BrowserError => e
      warn "IEV Scraper error for #{code}: #{e.message}"
      nil
    ensure
      # `browser` is nil here if an error occurred before it was assigned.
      browser&.quit
    end

    # Fetch and parse concept data for an IEV code.
    # Returns a hash with concept data or nil if not found.
    def fetch_concept(code)
      doc = fetch_page(code)
      return nil unless doc

      PageParser.new(doc, code).parse
    end

    private

    # Builds a randomized, internally consistent header set: the
    # Sec-Ch-Ua / Sec-Ch-Ua-Platform client hints always match the
    # sampled User-Agent profile.
    def random_headers
      profile = USER_AGENT_PROFILES.sample
      sec_ch_ua = "\"Google Chrome\";v=\"#{profile[:chrome_version]}\", " \
                  "\"Chromium\";v=\"#{profile[:chrome_version]}\", " \
                  "\"Not_A Brand\";v=\"24\""

      {
        "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
                    "image/avif,image/webp,image/apng,*/*;q=0.8," \
                    "application/signed-exchange;v=b3;q=0.7",
        "Accept-Language" => "en-GB,en-US;q=0.9,en;q=0.8",
        "Cache-Control" => "no-cache",
        "Pragma" => "no-cache",
        "Sec-Ch-Ua" => sec_ch_ua,
        "Sec-Ch-Ua-Mobile" => "?0",
        "Sec-Ch-Ua-Platform" => profile[:platform],
        "Sec-Fetch-Dest" => "document",
        "Sec-Fetch-Mode" => "navigate",
        "Sec-Fetch-Site" => "cross-site",
        "Sec-Fetch-User" => "?1",
        "Upgrade-Insecure-Requests" => "1",
        "User-Agent" => profile[:user_agent],
      }
    end
  end
end
134
+
135
+ require_relative "scraper/page_parser"
@@ -11,7 +11,6 @@ module Iev
11
11
  # @example
12
12
  # SourceParser.new(cell_data_string).parsed_sources
13
13
  class SourceParser
14
- include Cli::Ui
15
14
  include Utilities
16
15
  using DataConversions
17
16
 
@@ -71,20 +70,25 @@ module Iev
71
70
  end
72
71
 
73
72
  def extract_single_source(raw_ref)
74
- relation_type = extract_source_relationship(raw_ref)
73
+ relationship = extract_source_relationship(raw_ref)
75
74
  clean_ref = normalize_ref_string(raw_ref)
76
75
  source_ref = extract_source_ref(clean_ref)
77
76
  clause = extract_source_clause(clean_ref)
78
77
 
79
- {
80
- "ref" => source_ref,
81
- "clause" => clause,
82
- "link" => obtain_source_link(source_ref),
83
- "relationship" => relation_type,
84
- "original" => Iev::Converter.mathml_to_asciimath(
78
+ origin = Glossarist::Citation.new(
79
+ ref: source_ref,
80
+ locality: build_locality(clause),
81
+ link: obtain_source_link(source_ref),
82
+ original: Iev::Converter.mathml_to_asciimath(
85
83
  parse_anchor_tag(raw_ref, @term_domain),
86
84
  ),
87
- }.compact
85
+ )
86
+
87
+ Glossarist::ConceptSource.new(
88
+ status: relationship[:status],
89
+ origin: origin,
90
+ modification: relationship[:modification],
91
+ )
88
92
  rescue ::RelatonBib::RequestError => e
89
93
  warn e.message
90
94
  end
@@ -208,7 +212,6 @@ module Iev
208
212
  /Constitution de l’Union internationale des télécommunications (UIT)/
209
213
  "International Telecommunication Union (ITU) Constitution (Ed. 2015)"
210
214
  else
211
- debug :sources, "Failed to parse source: '#{str}'"
212
215
  str
213
216
  end
214
217
  end
@@ -320,26 +323,36 @@ module Iev
320
323
 
321
324
  case str
322
325
  when /^MOD ([\d\-])/
323
- {
324
- "type" => type.to_s,
325
- }
326
+ { status: type.to_s }
326
327
  when /(modified|modifié|modifiée|modifiés|MOD)\s*[–-]?\s+(.+)\Z/
327
328
  {
328
- "type" => type.to_s,
329
- "modification" => Iev::Converter.mathml_to_asciimath(
329
+ status: type.to_s,
330
+ modification: Iev::Converter.mathml_to_asciimath(
330
331
  parse_anchor_tag(::Regexp.last_match(2), @term_domain),
331
332
  ).strip,
332
333
  }
333
334
  else
334
- {
335
- "type" => type.to_s,
336
- }
335
+ { status: type.to_s }
337
336
  end
338
337
  end
339
338
 
339
+ def build_locality(clause)
340
+ return nil unless clause
341
+
342
+ Glossarist::Locality.new(
343
+ type: "clause",
344
+ reference_from: clause,
345
+ )
346
+ end
347
+
340
348
  # Uses Relaton to obtain link for given source ref.
341
349
  def obtain_source_link(ref)
350
+ return nil unless defined?(RelatonDb)
351
+
342
352
  RelatonDb.instance.fetch(ref)&.url
353
+ rescue ::RelatonBib::RequestError => e
354
+ warn e.message
355
+ nil
343
356
  end
344
357
  end
345
358
  end
@@ -9,8 +9,8 @@ module Iev
9
9
  #
10
10
  # @example
11
11
  # SupersessionParser.new(cell_data_string).supersessions
12
+ # # => [Glossarist::RelatedConcept, ...]
12
13
  class SupersessionParser
13
- include Cli::Ui
14
14
  using DataConversions
15
15
 
16
16
  attr_reader :raw_str, :src_str, :supersessions
@@ -52,18 +52,14 @@ module Iev
52
52
  end
53
53
 
54
54
  def relation_from_match(match_data)
55
- {
56
- "type" => "supersedes",
57
- "ref" => iev_ref_from_match(match_data),
58
- }
59
- end
60
-
61
- def iev_ref_from_match(match_data)
62
- {
63
- "source" => "IEV",
64
- "id" => match_data[:ref],
65
- "version" => match_data[:version],
66
- }
55
+ Glossarist::RelatedConcept.new(
56
+ type: "supersedes",
57
+ ref: Glossarist::Citation.new(
58
+ source: "IEV",
59
+ id: match_data[:ref],
60
+ version: match_data[:version],
61
+ ),
62
+ )
67
63
  end
68
64
  end
69
65
  end
@@ -13,7 +13,6 @@ module Iev
13
13
  # parser.plurality # returns grammatical plurality
14
14
  # parser.part_of_speech # returns part of speech
15
15
  class TermAttrsParser
16
- include Cli::Ui
17
16
  using DataConversions
18
17
 
19
18
  attr_reader :raw_str, :src_str, :gender, :geographical_area,
@@ -44,6 +43,19 @@ module Iev
44
43
  "<ATTRIBUTES: #{src_str}>".freeze
45
44
  end
46
45
 
46
+ # Constructs a Glossarist::Designation::GrammarInfo from the parsed
47
+ # gender, plurality, and part_of_speech attributes.
48
+ # Returns nil if none of these attributes were parsed.
49
+ def to_grammar_info
50
+ return nil unless gender || plurality || part_of_speech
51
+
52
+ Glossarist::Designation::GrammarInfo.new(
53
+ gender: gender ? [gender] : nil,
54
+ number: plurality ? [plurality] : nil,
55
+ part_of_speech: part_of_speech,
56
+ )
57
+ end
58
+
47
59
  private
48
60
 
49
61
  def parse
@@ -58,10 +70,7 @@ module Iev
58
70
 
59
71
  return unless /\p{Word}/.match?(curr_str)
60
72
 
61
- debug(
62
- :term_attributes,
63
- "Term attributes could not be parsed completely: '#{src_str}'",
64
- )
73
+ # Term attributes could not be parsed completely
65
74
  end
66
75
 
67
76
  def extract_gender(str)
@@ -130,11 +139,16 @@ module Iev
130
139
  \b
131
140
  /x
132
141
 
133
- @prefix = true if remove_from_string(str, prefix_rx)
142
+ removed = remove_from_string(str, prefix_rx)
143
+ @prefix = removed if removed
134
144
  end
135
145
 
136
146
  def decode_attrs_string(str)
137
- str.decode_html || ""
147
+ decoded = str.decode_html || ""
148
+ # Strip common HTML inline tags that appear in TERMATTRIBUTE data
149
+ # and would interfere with usage_info angle-bracket parsing.
150
+ # Only strip known HTML tags, not usage_info like <telecommunications>.
151
+ decoded.gsub(/<\/?(?:sup|sub|i|b|em|strong|span|small)>/, "")
138
152
  end
139
153
 
140
154
  def remove_from_string(string, regexp)