iev 0.4.3 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -142,6 +142,30 @@ module Iev
142
142
  summary
143
143
  end
144
144
 
145
# Thor command: fetch the IEV subject-area hierarchy and emit it as YAML,
# either to the file named by --output or to stdout.
desc "subject_areas", "Fetch IEV subject areas and sections from Electropedia."
option :output, aliases: :o, desc: "Output YAML file (default: stdout)"
option :refresh, type: :boolean, default: false,
                 desc: "Force re-fetch even if cached"
def subject_areas
  # --refresh removes the cached YAML before fetching.
  if options[:refresh]
    stale_cache = File.join(Iev.config.cache_dir, "subject_areas.yaml")
    FileUtils.rm_f(stale_cache) if File.exist?(stale_cache)
  end

  document = YAML.dump(Iev::SubjectAreas.fetch)
  destination = options[:output]

  # No --output given: the YAML itself goes to stdout.
  unless destination
    puts document
    return
  end

  File.write(destination, document, encoding: "utf-8")
  puts "Written to #{destination}"
rescue Iev::SubjectAreas::FetchError => e
  # A failed fetch is reported and turned into a non-zero exit status.
  error e.message
  exit 1
end
168
+
145
169
  desc "fetch CODE", "Fetch an IEV concept and output YAML to stdout."
146
170
  option :scrape, type: :boolean, default: false,
147
171
  desc: "Scrape from Electropedia instead of using cached data"
data/lib/iev/exporter.rb CHANGED
@@ -28,16 +28,19 @@ module Iev
28
28
  # @param only_concepts [String, nil] SQL LIKE pattern for IEVREF filtering
29
29
  # @param only_languages [String, nil] comma-separated language codes
30
30
  # @param fetch_relaton_links [Boolean] fetch source URLs via Relaton
31
+ # @param include_areas [Boolean] create area/section hierarchy concepts
31
32
  # @param on_progress [Proc, nil] callback (current, total) during build
32
33
  def initialize(input_path, output_dir: Dir.pwd,
33
34
  only_concepts: nil, only_languages: nil,
34
35
  fetch_relaton_links: false,
36
+ include_areas: true,
35
37
  on_progress: nil)
36
38
  @input_path = Pathname.new(input_path)
37
39
  validate_input!
38
40
 
39
41
  @output_dir = Pathname.new(output_dir)
40
42
  @fetch_relaton_links = fetch_relaton_links
43
+ @include_areas = include_areas
41
44
  @on_progress = on_progress
42
45
  @filters = {
43
46
  only_concepts: only_concepts,
@@ -51,6 +54,7 @@ module Iev
51
54
  start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
52
55
  dataset = load_dataset
53
56
  collection = build_collection(dataset)
57
+ add_subject_area_concepts(collection) if @include_areas
54
58
  save_collection(collection)
55
59
  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
56
60
 
@@ -137,6 +141,7 @@ module Iev
137
141
 
138
142
  concept = concept_index[term.id] ||= begin
139
143
  c = Glossarist::ManagedConcept.new(data: { "id" => term.id })
144
+ c.data.domains = domain_references_for(term.id)
140
145
  collection.store(c)
141
146
  c
142
147
  end
@@ -148,6 +153,10 @@ module Iev
148
153
  SourceParser.relaton_enabled = true
149
154
  end
150
155
 
156
# Hand +collection+ to SubjectAreaConcepts.add_to and return whatever
# that call returns.
#
# @param collection [Object] the concept collection being exported
def add_subject_area_concepts(collection)
  SubjectAreaConcepts.add_to(collection)
end
159
+
151
160
  def save_collection(collection)
152
161
  concepts_dir = output_dir.expand_path.join("concepts")
153
162
  FileUtils.mkdir_p(concepts_dir)
@@ -157,5 +166,15 @@ module Iev
157
166
  def localized_count(collection)
158
167
  collection.sum { |c| c.localized_concepts.count }
159
168
  end
169
+
170
# Build the domain references for an IEV reference: first the area
# (leading segment), then the section (first two segments joined by "-").
#
# A reference whose area or section segment is missing or empty (nil, "",
# "102", "-01", "102--05") has no usable hierarchy position and yields [].
#
# @param ievref [#to_s] IEV reference such as "102-01-05"
# @return [Array] results of Glossarist::ConceptReference.domain, area first
def domain_references_for(ievref)
  area_code, section_number = ievref.to_s.split("-")
  return [] if [area_code, section_number].any? { |part| part.nil? || part.empty? }

  [
    SubjectAreas.area_uri(area_code),
    SubjectAreas.section_uri("#{area_code}-#{section_number}"),
  ].map { |uri| Glossarist::ConceptReference.domain(uri) }
end
160
179
  end
161
180
  end
@@ -0,0 +1,102 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ferrum"
4
+
5
module Iev
  # Shared headless browser utilities for fetching pages behind AWS WAF.
  module ScraperBrowser
    # Chrome User-Agent strings with the platform and major version that
    # the matching Sec-Ch-Ua-* headers are built from. One profile is picked
    # per request by .random_headers.
    USER_AGENT_PROFILES = [
      {
        user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/131.0.0.0 Safari/537.36",
        platform: '"macOS"',
        chrome_version: "131",
      },
      {
        user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/130.0.0.0 Safari/537.36",
        platform: '"Windows"',
        chrome_version: "130",
      },
      {
        user_agent: "Mozilla/5.0 (X11; Linux x86_64) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/131.0.0.0 Safari/537.36",
        platform: '"Linux"',
        chrome_version: "131",
      },
      {
        user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/129.0.0.0 Safari/537.36",
        platform: '"macOS"',
        chrome_version: "129",
      },
      {
        user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/131.0.0.0 Safari/537.36",
        platform: '"Windows"',
        chrome_version: "131",
      },
    ].map(&:freeze).freeze

    # Chrome command-line flags always passed unless the caller overrides
    # the same key in browser_opts[:browser_options].
    DEFAULT_BROWSER_FLAGS = {
      "disable-blink-features" => "AutomationControlled",
    }.freeze

    # Seconds to wait for the network to go idle after navigation.
    IDLE_TIMEOUT = 15

    # Substrings whose presence in the body marks the response as blocked.
    BLOCK_MARKERS = ["403 ERROR", "Request blocked"].freeze

    # Fetch a URL using headless Chrome, returning the page HTML.
    # Waits for the network to go idle before reading the body so that a
    # JavaScript challenge page has a chance to run.
    #
    # @param url [String] page to load
    # @param browser_opts [Hash] extra Ferrum::Browser options; a
    #   :browser_options entry is merged into DEFAULT_BROWSER_FLAGS
    # @return [String, nil] page HTML, or nil when the page looks blocked
    #   or the browser raised a Ferrum error (a warning is printed)
    def self.fetch(url, browser_opts: {})
      browser = Ferrum::Browser.new(**browser_settings(browser_opts))

      browser.headers.set(random_headers)
      browser.go_to(url)
      browser.network.wait_for_idle(timeout: IDLE_TIMEOUT)
      html = browser.body

      if blocked_page?(html)
        warn "IEV: AWS WAF blocked request for #{url}"
        return nil
      end

      html
    rescue Ferrum::Error, Ferrum::BrowserError => e
      warn "IEV: Browser error fetching #{url}: #{e.message}"
      nil
    ensure
      # browser is nil when Ferrum::Browser.new itself raised.
      browser&.quit
    end

    # Request headers for one browser profile.
    #
    # @param profile [Hash] entry shaped like those in USER_AGENT_PROFILES;
    #   defaults to a random one
    # @return [Hash{String => String}]
    def self.random_headers(profile = USER_AGENT_PROFILES.sample)
      version = profile[:chrome_version]
      sec_ch_ua = "\"Google Chrome\";v=\"#{version}\", " \
                  "\"Chromium\";v=\"#{version}\", " \
                  "\"Not_A Brand\";v=\"24\""

      {
        "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
                    "image/avif,image/webp,image/apng,*/*;q=0.8," \
                    "application/signed-exchange;v=b3;q=0.7",
        "Accept-Language" => "en-GB,en-US;q=0.9,en;q=0.8",
        "Cache-Control" => "no-cache",
        "Pragma" => "no-cache",
        "Sec-Ch-Ua" => sec_ch_ua,
        "Sec-Ch-Ua-Mobile" => "?0",
        "Sec-Ch-Ua-Platform" => profile[:platform],
        "Sec-Fetch-Dest" => "document",
        "Sec-Fetch-Mode" => "navigate",
        "Sec-Fetch-Site" => "cross-site",
        "Sec-Fetch-User" => "?1",
        "Upgrade-Insecure-Requests" => "1",
        "User-Agent" => profile[:user_agent],
      }
    end

    # Keyword arguments for Ferrum::Browser.new. Caller options win over
    # the defaults; :browser_options is merged key by key so that passing
    # an extra Chrome flag does not discard DEFAULT_BROWSER_FLAGS.
    def self.browser_settings(browser_opts)
      overrides = browser_opts.to_h
      flags = DEFAULT_BROWSER_FLAGS.merge(overrides[:browser_options] || {})
      defaults = { headless: "new", timeout: 30, window_size: [1366, 768] }

      defaults.merge(overrides).merge(browser_options: flags)
    end
    private_class_method :browser_settings

    # True when the body contains one of BLOCK_MARKERS.
    def self.blocked_page?(html)
      BLOCK_MARKERS.any? { |marker| html.include?(marker) }
    end
    private_class_method :blocked_page?
  end
end
data/lib/iev/scraper.rb CHANGED
@@ -1,59 +1,12 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "nokogiri"
4
+
3
5
  module Iev
4
- # Scrapes IEV term data from Electropedia (electropedia.org).
5
- #
6
- # Electropedia is behind AWS WAF which requires JavaScript execution,
7
- # so a headless browser (via Ferrum/Chrome) is used to handle the challenge.
8
- #
9
- # @example
10
- # scraper = Iev::Scraper.new
11
- # concept = scraper.fetch_concept("103-01-02")
12
- # doc = scraper.fetch_page("103-01-02")
13
6
  class Scraper
14
7
  BASE_URL = "https://www.electropedia.org/iev/iev.nsf/" \
15
8
  "display?openform&ievref="
16
9
 
17
- # Pool of realistic Chrome User-Agent strings with matching platform hints.
18
- # Rotated per request to reduce fingerprinting by AWS WAF.
19
- USER_AGENT_PROFILES = [
20
- {
21
- user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
22
- "AppleWebKit/537.36 (KHTML, like Gecko) " \
23
- "Chrome/131.0.0.0 Safari/537.36",
24
- platform: '"macOS"',
25
- chrome_version: "131",
26
- },
27
- {
28
- user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
29
- "AppleWebKit/537.36 (KHTML, like Gecko) " \
30
- "Chrome/130.0.0.0 Safari/537.36",
31
- platform: '"Windows"',
32
- chrome_version: "130",
33
- },
34
- {
35
- user_agent: "Mozilla/5.0 (X11; Linux x86_64) " \
36
- "AppleWebKit/537.36 (KHTML, like Gecko) " \
37
- "Chrome/131.0.0.0 Safari/537.36",
38
- platform: '"Linux"',
39
- chrome_version: "131",
40
- },
41
- {
42
- user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
43
- "AppleWebKit/537.36 (KHTML, like Gecko) " \
44
- "Chrome/129.0.0.0 Safari/537.36",
45
- platform: '"macOS"',
46
- chrome_version: "129",
47
- },
48
- {
49
- user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
50
- "AppleWebKit/537.36 (KHTML, like Gecko) " \
51
- "Chrome/131.0.0.0 Safari/537.36",
52
- platform: '"Windows"',
53
- chrome_version: "131",
54
- },
55
- ].freeze
56
-
57
10
  def initialize(browser_opts: {})
58
11
  @browser_opts = browser_opts
59
12
  end
@@ -61,37 +14,10 @@ module Iev
61
14
  # Fetch the Electropedia page HTML for a given IEV code.
62
15
  # Returns a Nokogiri document.
63
16
  def fetch_page(code)
64
- require "ferrum"
65
- require "nokogiri"
66
-
67
- url = "#{BASE_URL}#{code}"
68
- browser = Ferrum::Browser.new(
69
- headless: "new",
70
- timeout: 30,
71
- window_size: [1366, 768],
72
- browser_options: {
73
- "disable-blink-features" => "AutomationControlled",
74
- },
75
- **@browser_opts,
76
- )
77
-
78
- browser.headers.set(random_headers)
79
- browser.go_to(url)
80
- browser.network.wait_for_idle(timeout: 15)
81
- html = browser.body
82
-
83
- # Check if we got a real page or a WAF block
84
- if html.include?("403 ERROR") || html.include?("Request blocked")
85
- warn "IEV Scraper: AWS WAF blocked request for #{code}"
86
- return nil
87
- end
17
+ html = ScraperBrowser.fetch("#{BASE_URL}#{code}", browser_opts: @browser_opts)
18
+ return nil unless html
88
19
 
89
20
  Nokogiri::HTML(html)
90
- rescue Ferrum::Error, Ferrum::BrowserError => e
91
- warn "IEV Scraper error for #{code}: #{e.message}"
92
- nil
93
- ensure
94
- browser&.quit
95
21
  end
96
22
 
97
23
  # Fetch and parse concept data for an IEV code.
@@ -102,34 +28,8 @@ module Iev
102
28
 
103
29
  PageParser.new(doc, code).parse
104
30
  end
105
-
106
- private
107
-
108
- def random_headers
109
- profile = USER_AGENT_PROFILES.sample
110
- sec_ch_ua = "\"Google Chrome\";v=\"#{profile[:chrome_version]}\", " \
111
- "\"Chromium\";v=\"#{profile[:chrome_version]}\", " \
112
- "\"Not_A Brand\";v=\"24\""
113
-
114
- {
115
- "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
116
- "image/avif,image/webp,image/apng,*/*;q=0.8," \
117
- "application/signed-exchange;v=b3;q=0.7",
118
- "Accept-Language" => "en-GB,en-US;q=0.9,en;q=0.8",
119
- "Cache-Control" => "no-cache",
120
- "Pragma" => "no-cache",
121
- "Sec-Ch-Ua" => sec_ch_ua,
122
- "Sec-Ch-Ua-Mobile" => "?0",
123
- "Sec-Ch-Ua-Platform" => profile[:platform],
124
- "Sec-Fetch-Dest" => "document",
125
- "Sec-Fetch-Mode" => "navigate",
126
- "Sec-Fetch-Site" => "cross-site",
127
- "Sec-Fetch-User" => "?1",
128
- "Upgrade-Insecure-Requests" => "1",
129
- "User-Agent" => profile[:user_agent],
130
- }
131
- end
132
31
  end
133
32
  end
134
33
 
34
+ require_relative "scraper/browser"
135
35
  require_relative "scraper/page_parser"
@@ -0,0 +1,123 @@
1
+ # frozen_string_literal: true
2
+
3
module Iev
  # Creates ManagedConcept entries for the IEV subject area hierarchy.
  #
  # The hierarchy has two levels:
  #   - Area (e.g., "102" = "Mathematics - General concepts and linear algebra")
  #   - Section (e.g., "102-01" = "Sets and operations")
  #
  # What this module sets on the concepts it builds (identifiers are the
  # URIs produced by SubjectAreas.area_uri / SubjectAreas.section_uri):
  #   - Area concept: `domains` holds a reference to its own URI and
  #     `related` holds one "narrower" relation per section.
  #   - Section concept: `domains` holds references to the parent area URI
  #     and its own URI; its localized data has `domain` set to the parent
  #     area URI and `related` set to one "broader" relation.
  #
  # NOTE(review): the area's relations are assigned on the ManagedConcept
  # while the section's are assigned on its ConceptData. That asymmetry is
  # preserved as found; confirm it is intended.
  module SubjectAreaConcepts
    # Language code used for every generated localization.
    LANGUAGE_CODE = "eng"

    class << self
      # Build all area and section concepts and add them to the collection.
      # Each area is stored first, followed by its sections.
      #
      # NOTE(review): the areas come from Iev.subject_areas, which is
      # defined outside this file; each entry is read as a Hash with
      # "code", "title" and an optional "sections" array.
      #
      # @param collection [Glossarist::ManagedConceptCollection]
      # @return [void]
      def add_to(collection)
        Iev.subject_areas.each do |area|
          collection.store(build_area_concept(area))

          sections_of(area).each do |section|
            collection.store(build_section_concept(section, area))
          end
        end
      end

      private

      # Sections of an area hash; a missing "sections" key means none.
      def sections_of(area)
        area["sections"] || []
      end

      # Managed concept for an area, with "narrower" relations when the
      # area has sections.
      def build_area_concept(area)
        id = SubjectAreas.area_uri(area["code"])
        mc = build_managed_concept(id, [id])

        mc.add_localization(build_localization(id, area["title"], LANGUAGE_CODE))

        narrower = sections_of(area).map { |s| build_narrower_ref(s["code"]) }
        mc.related = narrower unless narrower.empty?

        mc
      end

      # Managed concept for a section, linked back to its parent area.
      def build_section_concept(section, area)
        id = SubjectAreas.section_uri(section["code"])
        area_id = SubjectAreas.area_uri(area["code"])
        mc = build_managed_concept(id, [area_id, id])

        cd = build_concept_data(id, section["title"], LANGUAGE_CODE)
        cd.domain = area_id
        cd.related = [build_broader_ref(area["code"])]

        mc.add_localization(build_localization_from_data(id, cd))
        mc
      end

      # Managed concept shell whose domains reference the given URIs, in
      # the order given.
      def build_managed_concept(id, domain_ids)
        Glossarist::ManagedConcept.new(
          data: Glossarist::ManagedConceptData.new(
            id: id,
            domains: domain_ids.map { |uri| Glossarist::ConceptReference.domain(uri) },
          ),
        )
      end

      # Concept data carrying +title+ as its single preferred expression.
      def build_concept_data(id, title, lang_code)
        Glossarist::ConceptData.new(
          id: id,
          language_code: lang_code,
          terms: [
            Glossarist::Designation::Expression.new(
              type: "expression",
              designation: title,
              normative_status: "preferred",
            ),
          ],
        )
      end

      # Localization built from a bare title.
      def build_localization(id, title, lang_code)
        build_localization_from_data(id, build_concept_data(id, title, lang_code))
      end

      # Localization wrapping existing concept data, marked valid/published.
      def build_localization_from_data(id, concept_data)
        l10n = Glossarist::LocalizedConcept.new
        l10n.data = concept_data
        l10n.id = id
        l10n.entry_status = "valid"
        l10n.data.review_decision_event = "published"
        l10n
      end

      # "broader" relation pointing at the area URI for +area_code+.
      def build_broader_ref(area_code)
        Glossarist::RelatedConcept.new(
          type: "broader",
          content: SubjectAreas.area_uri(area_code),
        )
      end

      # "narrower" relation pointing at the section URI for +section_code+.
      def build_narrower_ref(section_code)
        Glossarist::RelatedConcept.new(
          type: "narrower",
          content: SubjectAreas.section_uri(section_code),
        )
      end
    end
  end
end
@@ -0,0 +1,232 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "yaml"
4
+ require "nokogiri"
5
+ require "fileutils"
6
+ require "iev/config"
7
+
8
module Iev
  # IEV subject areas and their sections.
  #
  # Two independent halves:
  #   - a query API (.all, .find_area, ...) that reads DATA_FILE;
  #   - a fetcher (.fetch) that loads the pages through ScraperBrowser and
  #     keeps its results in CACHE_FILE under Iev.config.cache_dir.
  module SubjectAreas
    DATA_FILE = File.expand_path("../../data/subject_areas.yaml", __dir__)

    AREAS_URL = "https://electropedia.org/iev/iev.nsf/" \
                "6d6bdd8667c378f7c12581fa003d80e7?OpenForm"
    SECTIONS_URL_TEMPLATE = "https://electropedia.org/iev/iev.nsf/" \
                            "index?openform&part=%<part>s"

    # Pages with fewer characters than this are treated as a challenge page.
    MIN_PAGE_SIZE = 15_000

    # Seconds to pause between section pages.
    FETCH_DELAY = 5
    # Base seconds to wait before retrying; multiplied by the attempt number.
    RETRY_DELAY = 30
    # Total number of attempts per page.
    MAX_RETRIES = 2

    # File name of the fetch cache inside Iev.config.cache_dir.
    CACHE_FILE = "subject_areas.yaml"
    # A cache with fewer areas than this is never considered complete.
    MIN_AREA_COUNT = 99
    # Partial results are written to the cache every this many areas.
    SAVE_INTERVAL = 10
    # Phrases whose presence marks a page as a challenge page.
    CAPTCHA_MARKERS = [
      "Confirm you are human",
      "solve a puzzle",
      "security check before continuing",
    ].freeze
    # Text of a table cell that introduces a section: code, then title.
    SECTION_PATTERN = /\ASection\s+([\d-]+):\s*(.+)\z/

    class FetchError < StandardError; end

    class << self
      # --- URI scheme ---

      # URI for a subject area concept.
      # @param code [String, Integer] e.g. "102"
      # @return [String] e.g. "area-102"
      def area_uri(code)
        "area-#{code}"
      end

      # URI for a section concept.
      # @param code [String] e.g. "103-01"
      # @return [String] e.g. "section-103-01"
      def section_uri(code)
        "section-#{code}"
      end

      # --- Query API (reads from bundled data) ---

      # Return all subject areas with their sections.
      # @return [Array<Hash>] each hash has "code", "title", "sections"
      def all
        data["areas"]
      end

      # Find a single subject area by its numeric code.
      # @param code [String, Integer] e.g. "102" or 102
      # @return [Hash, nil]
      def find_area(code)
        wanted = code.to_s
        all.find { |area| area["code"] == wanted }
      end

      # Return all sections for a given area code.
      # @param code [String, Integer] area code, e.g. "102"
      # @return [Array<Hash>] each hash has "code", "title"; empty when the
      #   area is unknown or has no "sections" entry
      def sections_for(code)
        area = find_area(code)
        (area && area["sections"]) || []
      end

      # Find a single section by its section code.
      # @param section_code [String] e.g. "102-01"
      # @return [Hash, nil]
      def find_section(section_code)
        wanted = section_code.to_s
        all.each do |area|
          match = (area["sections"] || []).find { |s| s["code"] == wanted }
          return match if match
        end
        nil
      end

      # Return the parent area for a given section code.
      # @param section_code [String] e.g. "102-01"
      # @return [Hash, nil]
      def area_for_section(section_code)
        wanted = section_code.to_s
        all.find do |area|
          (area["sections"] || []).any? { |s| s["code"] == wanted }
        end
      end

      # --- Fetching (network, writes to the cache directory) ---

      # Fetch every area and its sections, reusing cached progress.
      #
      # A complete cache (see #complete?) is returned as is. Otherwise the
      # area list is fetched again, cached entries are kept for areas that
      # are still listed, and sections are fetched for every area not yet
      # marked "fetched". An area whose sections cannot be fetched is kept
      # with its existing (or empty) sections and left unmarked so that a
      # later run retries it.
      #
      # @return [Hash] { "areas" => [...] }, also written to the cache
      # @raise [FetchError] when the area list itself cannot be fetched
      def fetch
        cached = read_cache(CACHE_FILE)
        return cached if cached && complete?(cached)

        cached_areas = cached ? cached["areas"] : []
        fresh_areas = fetch_areas
        puts "Found #{fresh_areas.length} areas (#{cached_areas.length} cached)" if $stdout.tty?

        areas = merge_with_cache(fresh_areas, cached_areas)
        fetch_missing_sections(areas)

        result = { "areas" => areas }
        write_cache(CACHE_FILE, result)
        result
      end

      # Scrape the list of areas from AREAS_URL.
      #
      # Every link whose href contains "part=<digits>" and whose text is not
      # blank becomes an area; the first link per code wins.
      #
      # @return [Array<Hash>] hashes with "code", "title" and empty "sections"
      # @raise [FetchError]
      def fetch_areas
        doc = Nokogiri::HTML(fetch_page_with_retry(AREAS_URL))

        areas = doc.css("a").each_with_object([]) do |link, found|
          code = link["href"].to_s[/part=(\d+)/, 1]
          next unless code

          title = link.text.strip
          next if title.empty?

          found << { "code" => code, "title" => title, "sections" => [] }
        end

        areas.uniq { |area| area["code"] }
      end

      # Scrape the sections of one area.
      #
      # Every table cell whose stripped text matches SECTION_PATTERN becomes
      # a section; the first cell per code wins.
      #
      # @param part [String, Integer] area code
      # @return [Array<Hash>] hashes with "code" and "title"
      # @raise [FetchError]
      def fetch_sections(part)
        url = format(SECTIONS_URL_TEMPLATE, part: part)
        doc = Nokogiri::HTML(fetch_page_with_retry(url))

        sections = doc.css("td").each_with_object([]) do |cell, found|
          match = cell.text.strip.match(SECTION_PATTERN)
          found << { "code" => match[1], "title" => match[2].strip } if match
        end

        sections.uniq { |section| section["code"] }
      end

      private

      # Bundled data, loaded once. A missing, empty or malformed DATA_FILE
      # is treated as having no areas.
      def data
        @data ||= load_bundled_data
      end

      def load_bundled_data
        return { "areas" => [] } unless File.exist?(DATA_FILE)

        loaded = YAML.safe_load(File.read(DATA_FILE, encoding: "utf-8"))
        usable = loaded.is_a?(Hash) && loaded["areas"].is_a?(Array)
        usable ? loaded : { "areas" => [] }
      end

      # Fresh areas in their fetched order, each replaced by its cached
      # entry when one with the same code exists. Cached areas that are no
      # longer listed are dropped.
      def merge_with_cache(fresh_areas, cached_areas)
        cached_by_code = cached_areas.each_with_object({}) { |a, h| h[a["code"]] = a }
        fresh_areas.map { |fresh| cached_by_code[fresh["code"]] || fresh }
      end

      # Fill in "sections" for every area not yet marked "fetched",
      # mutating the hashes in +areas+.
      def fetch_missing_sections(areas)
        areas.each_with_index do |area, i|
          next if area["fetched"]

          begin
            area["sections"] = fetch_sections(area["code"])
            area["fetched"] = true
          rescue FetchError
            area["sections"] ||= []
            warn "IEV: Skipping area #{area["code"]} due to WAF"
          end

          puts "[#{i + 1}/#{areas.length}] #{area["code"]}: #{area["title"]} — #{area["sections"].length} sections" if $stdout.tty?

          # Save progress periodically so partial results survive failures.
          write_cache(CACHE_FILE, { "areas" => areas }) if ((i + 1) % SAVE_INTERVAL).zero?

          sleep FETCH_DELAY unless i == areas.length - 1
        end
      end

      # True when the payload lists at least MIN_AREA_COUNT areas and every
      # one of them is marked "fetched".
      def complete?(payload)
        areas = payload["areas"]
        return false unless areas && areas.length >= MIN_AREA_COUNT

        areas.all? { |area| area["fetched"] == true }
      end

      # True when the page is shorter than MIN_PAGE_SIZE or contains one of
      # CAPTCHA_MARKERS.
      def captcha_page?(html)
        html.length < MIN_PAGE_SIZE ||
          CAPTCHA_MARKERS.any? { |marker| html.include?(marker) }
      end

      # Fetch a page, retrying while a challenge page comes back.
      #
      # At least one attempt is always made, so +retries+ below 1 behaves
      # like 1. The wait before retry k is RETRY_DELAY * k seconds.
      #
      # @param url [String]
      # @param retries [Integer] total number of attempts
      # @return [String] page HTML
      # @raise [FetchError] when the browser returns nothing, or the last
      #   attempt still looks like a challenge page
      def fetch_page_with_retry(url, retries: MAX_RETRIES)
        # Loaded lazily so the query API works without the browser stack.
        require "iev/scraper/browser"

        attempts = [retries, 1].max

        1.upto(attempts) do |attempt|
          html = ScraperBrowser.fetch(url)
          raise FetchError, "Failed to fetch #{url}" unless html
          return html unless captcha_page?(html)
          raise FetchError, "WAF challenge for #{url}" if attempt == attempts

          wait = RETRY_DELAY * attempt
          warn "IEV: WAF challenge for #{url}, retrying in #{wait}s (attempt #{attempt}/#{attempts})"
          sleep wait
        end
      end

      # Cached payload, or nil when there is no usable cache: file missing,
      # not parsable as YAML, not a Hash, or without any areas.
      def read_cache(filename)
        cache_path = File.join(Iev.config.cache_dir, filename)
        return nil unless File.exist?(cache_path)

        payload = YAML.safe_load(File.read(cache_path, encoding: "utf-8"))
        return nil unless payload.is_a?(Hash)

        areas = payload["areas"]
        areas.is_a?(Array) && areas.any? ? payload : nil
      rescue Psych::SyntaxError => e
        warn "IEV: Ignoring unreadable cache #{cache_path}: #{e.message}"
        nil
      end

      # Write the payload as YAML. The file is written under a temporary
      # name and renamed into place so an interrupted write cannot leave a
      # truncated cache behind.
      def write_cache(filename, payload)
        cache_path = File.join(Iev.config.cache_dir, filename)
        FileUtils.mkdir_p(File.dirname(cache_path))

        temp_path = "#{cache_path}.tmp"
        File.write(temp_path, YAML.dump(payload), encoding: "utf-8")
        File.rename(temp_path, cache_path)
      end
    end
  end
end