iev 0.4.4 → 0.4.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/rake.yml +7 -4
- data/.github/workflows/release.yml +2 -0
- data/.gitignore +2 -0
- data/.rubocop.yml +4 -1
- data/.rubocop_todo.yml +98 -21
- data/CLAUDE.md +17 -5
- data/Gemfile +8 -4
- data/README.adoc +395 -10
- data/exe/iev +1 -1
- data/iev.gemspec +3 -2
- data/lib/iev/cli/command.rb +3 -2
- data/lib/iev/cli/command_helper.rb +1 -2
- data/lib/iev/cli/ui.rb +5 -5
- data/lib/iev/config.rb +1 -15
- data/lib/iev/data_source.rb +4 -2
- data/lib/iev/db_writer.rb +1 -0
- data/lib/iev/exporter.rb +182 -10
- data/lib/iev/iev_code.rb +80 -0
- data/lib/iev/iso_639_code.rb +2 -1
- data/lib/iev/relaton_db.rb +1 -1
- data/lib/iev/scraper/browser.rb +90 -88
- data/lib/iev/scraper.rb +5 -4
- data/lib/iev/section.rb +37 -0
- data/lib/iev/source_parser.rb +57 -11
- data/lib/iev/subject_area.rb +46 -0
- data/lib/iev/subject_area_concepts.rb +60 -35
- data/lib/iev/subject_areas.rb +72 -33
- data/lib/iev/supersession_parser.rb +1 -2
- data/lib/iev/term_attrs_parser.rb +1 -1
- data/lib/iev/term_builder.rb +14 -9
- data/lib/iev/utilities.rb +29 -1
- data/lib/iev/version.rb +1 -1
- data/lib/iev.rb +43 -11
- metadata +26 -22
data/lib/iev/exporter.rb
CHANGED
|
@@ -55,7 +55,9 @@ module Iev
|
|
|
55
55
|
dataset = load_dataset
|
|
56
56
|
collection = build_collection(dataset)
|
|
57
57
|
add_subject_area_concepts(collection) if @include_areas
|
|
58
|
+
build_section_narrower_relations(collection) if @include_areas
|
|
58
59
|
save_collection(collection)
|
|
60
|
+
save_register
|
|
59
61
|
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
|
|
60
62
|
|
|
61
63
|
@stats = {
|
|
@@ -85,7 +87,7 @@ module Iev
|
|
|
85
87
|
|
|
86
88
|
exts = (XLSX_EXTENSIONS + SQLITE_EXTENSIONS).join(", ")
|
|
87
89
|
raise ArgumentError,
|
|
88
|
-
|
|
90
|
+
"Unsupported format: #{input_path.extname}. Supported: #{exts}"
|
|
89
91
|
end
|
|
90
92
|
|
|
91
93
|
def input_format
|
|
@@ -139,13 +141,23 @@ module Iev
|
|
|
139
141
|
term = TermBuilder.build_from(row)
|
|
140
142
|
next unless term
|
|
141
143
|
|
|
144
|
+
# Parse IevCode once per concept — used by all helpers below.
|
|
145
|
+
code = IevCode.new(term.id)
|
|
146
|
+
|
|
142
147
|
concept = concept_index[term.id] ||= begin
|
|
143
148
|
c = Glossarist::ManagedConcept.new(data: { "id" => term.id })
|
|
144
|
-
c.
|
|
149
|
+
c.uuid = term.id
|
|
150
|
+
c.schema_version = "3"
|
|
151
|
+
c.data.domains = domain_references_for(code)
|
|
152
|
+
c.data.tags = tags_for(code)
|
|
153
|
+
add_section_broader(c, code)
|
|
145
154
|
collection.store(c)
|
|
146
155
|
c
|
|
147
156
|
end
|
|
148
157
|
concept.add_l10n(term)
|
|
158
|
+
|
|
159
|
+
promote_supersession(concept, term)
|
|
160
|
+
set_managed_status(concept, term)
|
|
149
161
|
end
|
|
150
162
|
|
|
151
163
|
collection
|
|
@@ -160,21 +172,181 @@ module Iev
|
|
|
160
172
|
# Persist every concept of +collection+ below "<output_dir>/concepts".
#
# The target directory is created when it does not exist yet; the actual
# serialisation is delegated to the collection itself.
#
# @param collection [#save_grouped_concepts_to_files] concepts to write
# @return [Object] whatever +save_grouped_concepts_to_files+ returns
def save_collection(collection)
  target_dir = output_dir.expand_path.join("concepts").tap do |dir|
    FileUtils.mkdir_p(dir)
  end

  collection.save_grouped_concepts_to_files(target_dir.to_s)
end
|
|
177
|
+
|
|
178
|
+
# Write "<output_dir>/register.yaml", the dataset register for the export.
#
# The register combines fixed dataset metadata with the area/section tree
# derived from +SubjectAreas.all+. The parent directory is created when
# missing. A one-line summary is printed only when stdout is a terminal.
#
# @return [nil]
def save_register
  section_tree = build_section_tree(SubjectAreas.all)

  metadata = {
    schema_type: "glossarist",
    schema_version: "3",
    id: "iev",
    ref: "IEC 60050:2011",
    year: 2011,
    urn: IEV_SOURCE,
    urn_aliases: ["#{IEV_SOURCE}*"],
    status: "current",
    owner: "IEC",
    source_repo: "https://github.com/glossarist/iev-data",
    tags: %w[electrotechnical vocabulary iec],
    languages: %w[eng fra],
    language_order: %w[eng fra],
    ordering: "systematic",
    sections: section_tree,
  }
  register = Glossarist::DatasetRegister.new(**metadata)

  destination = output_dir.expand_path.join("register.yaml")
  FileUtils.mkdir_p(destination.dirname)
  File.write(destination, register.to_yaml, encoding: "utf-8")
  return unless $stdout.tty?

  puts "Written register.yaml with #{section_tree.length} areas"
end
|
|
205
|
+
|
|
206
|
+
# Convert subject areas into a two-level tree of Glossarist sections.
#
# Areas are ordered by the integer value of their code. The sections of
# each area are ordered by the integer value of every dash-separated part
# of their code (so "103-2" sorts before "103-10"). An area without
# sections gets +children: nil+ rather than an empty list.
#
# @param areas [Enumerable] objects responding to +code+, +title+, +sections+
# @return [Array<Glossarist::Section>] one node per area
def build_section_tree(areas)
  ordered_areas = areas.sort_by { |area| area.code.to_i }

  ordered_areas.map do |area|
    ordered_sections = area.sections.sort_by do |section|
      section.code.split("-").map(&:to_i)
    end

    child_nodes = ordered_sections.map do |section|
      Glossarist::Section.new(
        id: section.code,
        names: { "eng" => section.title },
      )
    end

    Glossarist::Section.new(
      id: area.code,
      names: { "eng" => area.title },
      children: (child_nodes unless child_nodes.empty?),
    )
  end
end
|
|
165
224
|
|
|
166
225
|
# Total number of localized concepts across the whole collection.
#
# @param collection [Enumerable] items responding to +localized_concepts+
# @return [Integer]
def localized_count(collection)
  collection.inject(0) do |total, concept|
    total + concept.localized_concepts.count
  end
end
|
|
169
228
|
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
229
|
+
# Build the classification references attached to a concept.
#
# Two reference kinds are produced:
#   - a "domain" reference to the area (thematic classification), always;
#   - a "section" reference to the section (structural membership), only
#     when the code carries a section part.
#
# @param code [IevCode] pre-parsed IEV code
# @return [Array<Glossarist::ConceptReference>] area reference first,
#   followed by the section reference when there is one
def domain_references_for(code)
  area_reference = domain_ref(code.area_uri)
  return [area_reference] unless code.section_code

  [area_reference, section_ref(code.section_uri)]
end
|
|
254
|
+
|
|
255
|
+
# Collect human-readable tags for a concept: the title of its area and,
# when the code has a section part, the title of its section. Entries that
# SubjectAreas cannot resolve are left out.
#
# @param code [IevCode] pre-parsed IEV code
# @return [Array<String>] area title first, then section title
def tags_for(code)
  area = SubjectAreas.find_area(code.area_code)
  section_code = code.section_code
  section = section_code && SubjectAreas.find_section(section_code)

  [area, section].select(&:itself).map(&:title)
end
|
|
265
|
+
|
|
266
|
+
# Link a concept to its section through a "broader" relation.
#
# Nothing happens when the code has no section part, or when the concept
# already holds a "broader" relation whose ref id equals the section URI.
# +concept.related+ is initialised to an empty list when it is nil.
#
# @param concept [Glossarist::ManagedConcept]
# @param code [IevCode] pre-parsed IEV code
def add_section_broader(concept, code)
  section_uri = code.section_uri
  return unless section_uri

  concept.related ||= []
  already_linked = concept.related.any? do |relation|
    relation.type == "broader" && relation.ref&.id == section_uri
  end
  return if already_linked

  section_reference = Glossarist::ConceptRef.new(source: "IEV", id: section_uri)
  concept.related << Glossarist::RelatedConcept.new(
    type: "broader",
    content: section_uri,
    ref: section_reference,
  )
end
|
|
282
|
+
|
|
283
|
+
# Give every section concept a "narrower" relation to each of its members.
#
# Concepts are indexed by +data.id+ (concepts without an id are ignored).
# Each id is parsed as an IevCode and grouped under its section URI; when a
# concept with that URI as id exists in the collection, one "narrower"
# relation per member is appended to it, ordered by member id.
#
# NOTE(review): relations are appended without a duplicate check, so this
# looks intended to run once per collection — confirm before re-running.
#
# @param collection [Enumerable] managed concepts
# @return [Hash{String => Array<String>}] member ids grouped by section URI
def build_section_narrower_relations(collection)
  concepts_by_id = {}
  collection.each do |managed|
    id = managed.data&.id
    concepts_by_id[id] = managed if id
  end

  ids_by_section = concepts_by_id.each_key.with_object({}) do |id, groups|
    uri = IevCode.new(id).section_uri
    (groups[uri] ||= []) << id if uri
  end

  ids_by_section.each do |section_uri, member_ids|
    parent = concepts_by_id[section_uri]
    next unless parent

    additions = member_ids.sort.map do |member_id|
      member_reference = Glossarist::ConceptRef.new(source: "IEV", id: member_id)
      Glossarist::RelatedConcept.new(
        type: "narrower",
        content: member_id,
        ref: member_reference,
      )
    end

    parent.related ||= []
    parent.related.concat(additions)
  end
end
|
|
312
|
+
|
|
313
|
+
# Move the relations found on a localization up to the managed concept.
# Supersession is language-independent (the REPLACES column is recorded
# per concept), so it belongs on the managed level.
#
# A relation is skipped when the concept already holds one with the same
# type and the same ref id. Once processed, the localization's relation
# list is cleared.
#
# NOTE(review): every entry of +term.data.related+ is moved, whatever its
# type — presumably only supersession relations are present at this
# point; confirm against TermBuilder.
#
# @param concept [Glossarist::ManagedConcept] receives the relations
# @param term [#data] localized concept being merged
# @return [nil]
def promote_supersession(concept, term)
  localized_relations = term.data&.related
  return unless localized_relations&.any?

  concept.related ||= []
  localized_relations.each do |candidate|
    duplicate = concept.related.any? do |existing|
      existing.type == candidate.type && existing.ref&.id == candidate.ref&.id
    end

    concept.related << candidate unless duplicate
  end

  term.data.related = nil
end
|
|
329
|
+
|
|
330
|
+
# Take the managed concept's status from a localization's entry_status.
#
# The first non-empty entry_status wins: an existing status on the concept
# is never overwritten, and nil or empty values are ignored.
#
# @param concept [#status, #status=] managed concept
# @param term [#entry_status] localized concept being merged
def set_managed_status(concept, term)
  return if concept.status

  candidate = term.entry_status
  return if !candidate || candidate.empty?

  concept.status = candidate
end
|
|
337
|
+
|
|
338
|
+
# --- ConceptReference factory helpers ---
|
|
339
|
+
|
|
340
|
+
# Build a "domain" concept reference whose source is IEV_SOURCE.
#
# @param concept_id [String] id of the referenced concept, e.g. "area-103"
# @return [Glossarist::ConceptReference]
def domain_ref(concept_id)
  Glossarist::ConceptReference.domain(concept_id).tap do |reference|
    reference.source = IEV_SOURCE
  end
end
|
|
173
345
|
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
346
|
+
# Build a "section" concept reference whose source is IEV_SOURCE.
#
# @param concept_id [String] id of the referenced section concept,
#   e.g. "section-103-01"
# @return [Glossarist::ConceptReference]
def section_ref(concept_id)
  Glossarist::ConceptReference.section(concept_id).tap do |reference|
    reference.source = IEV_SOURCE
  end
end
|
|
179
351
|
end
|
|
180
352
|
end
|
data/lib/iev/iev_code.rb
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Iev
  # Immutable value object that decomposes an IEV concept code
  # into its structural parts: area code, section code, and number.
  #
  # The IEV code format is AAA-BB-CC where:
  #   AAA = area code (e.g. "103")
  #   BB  = section sub-code (e.g. "01")
  #   CC  = concept number (e.g. "02")
  #
  # @example Full concept code
  #   code = Iev::IevCode.new("103-01-02")
  #   code.area_code     #=> "103"
  #   code.section_code  #=> "103-01"
  #   code.number        #=> "02"
  #   code.area_uri      #=> "area-103"
  #   code.section_uri   #=> "section-103-01"
  #
  # @example Section code (no concept number)
  #   code = Iev::IevCode.new("103-01")
  #   code.area_code     #=> "103"
  #   code.section_code  #=> "103-01"
  #   code.number        #=> nil
  #   code.section_uri   #=> "section-103-01"
  #
  class IevCode
    include Comparable

    # Shape accepted by {.parse}: one to three non-empty segments separated
    # by single dashes, with no whitespace. This is a structural check only;
    # segments are not required to be numeric.
    PARSEABLE_FORMAT = /\A[^-\s]+(?:-[^-\s]+){0,2}\z/
    private_constant :PARSEABLE_FORMAT

    attr_reader :raw, :area_code, :section_code, :number

    # Lenient constructor: never raises, and simply splits on "-". Segments
    # beyond the third are ignored. Use {.parse} to reject malformed input.
    #
    # @param code [#to_s] IEV reference, e.g. "103-01-02"
    def initialize(code)
      # String#to_s returns the receiver itself, so take a private frozen
      # copy; otherwise a caller mutating its string afterwards would change
      # #raw, #hash and #== of this supposedly immutable object.
      @raw = code.to_s.dup.freeze
      area, sub, concept_number = @raw.split("-")
      @area_code = area&.freeze
      @section_code = sub ? "#{area}-#{sub}".freeze : nil
      @number = concept_number&.freeze
      freeze
    end

    # @return [String] area identifier, e.g. "area-103"
    def area_uri
      "area-#{area_code}"
    end

    # @return [String, nil] section identifier, e.g. "section-103-01",
    #   or nil when the code has no section part
    def section_uri
      "section-#{section_code}" if section_code
    end

    # @return [String] the code exactly as given (frozen)
    def to_s
      @raw
    end

    # Implicit string conversion, so a code can be passed where a String
    # is expected.
    #
    # @return [String]
    def to_str
      @raw
    end

    # Two codes are equal when they are both IevCodes with the same raw text.
    def ==(other)
      other.is_a?(self.class) && raw == other.raw
    end
    alias_method :eql?, :==

    # Consistent with #eql?, so codes can be used as Hash keys.
    def hash
      raw.hash
    end

    # Orders codes by plain string comparison of their raw text; +other+ is
    # converted with +to_s+ first.
    def <=>(other)
      to_s <=> other.to_s
    end

    # Safe constructor that returns nil for codes that don't parse.
    #
    # Unlike {#initialize}, this rejects blank input, empty segments
    # ("103--02"), whitespace, and codes with more than three segments.
    #
    # @param code [#to_s]
    # @return [IevCode, nil]
    def self.parse(code)
      text = code.to_s
      return nil unless PARSEABLE_FORMAT.match?(text)

      new(text)
    end
  end
end
|
data/lib/iev/iso_639_code.rb
CHANGED
|
@@ -6,7 +6,8 @@
|
|
|
6
6
|
module Iev
|
|
7
7
|
# @todo This needs to be rewritten.
|
|
8
8
|
class Iso639Code
|
|
9
|
-
COUNTRY_CODES = YAML.
|
|
9
|
+
COUNTRY_CODES = YAML.safe_load_file(File.join(__dir__,
|
|
10
|
+
"iso_639_2.yaml"), permitted_classes: [Symbol]).freeze
|
|
10
11
|
# rubocop:disable Style/MutableConstant
|
|
11
12
|
THREE_CHAR_MEMO = {} # Memoization cache, must be mutable
|
|
12
13
|
# rubocop:enable Style/MutableConstant
|
data/lib/iev/relaton_db.rb
CHANGED
data/lib/iev/scraper/browser.rb
CHANGED
|
@@ -3,100 +3,102 @@
|
|
|
3
3
|
require "ferrum"
|
|
4
4
|
|
|
5
5
|
module Iev
  class Scraper
    # Shared headless browser utilities for fetching pages behind AWS WAF.
    module Browser
      # Browser identities to choose from. Each profile keeps the
      # User-Agent string, the platform hint and the Chrome major version
      # consistent with one another.
      USER_AGENT_PROFILES = [
        {
          user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
                      "AppleWebKit/537.36 (KHTML, like Gecko) " \
                      "Chrome/131.0.0.0 Safari/537.36",
          platform: '"macOS"',
          chrome_version: "131",
        },
        {
          user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
                      "AppleWebKit/537.36 (KHTML, like Gecko) " \
                      "Chrome/130.0.0.0 Safari/537.36",
          platform: '"Windows"',
          chrome_version: "130",
        },
        {
          user_agent: "Mozilla/5.0 (X11; Linux x86_64) " \
                      "AppleWebKit/537.36 (KHTML, like Gecko) " \
                      "Chrome/131.0.0.0 Safari/537.36",
          platform: '"Linux"',
          chrome_version: "131",
        },
        {
          user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
                      "AppleWebKit/537.36 (KHTML, like Gecko) " \
                      "Chrome/129.0.0.0 Safari/537.36",
          platform: '"macOS"',
          chrome_version: "129",
        },
        {
          user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
                      "AppleWebKit/537.36 (KHTML, like Gecko) " \
                      "Chrome/131.0.0.0 Safari/537.36",
          platform: '"Windows"',
          chrome_version: "131",
        },
      ].freeze

      # Fetch a URL using headless Chrome, returning the page HTML.
      # Handles AWS WAF challenge pages by waiting for JS execution.
      #
      # Failures are reported with +warn+ and turned into a nil result: a
      # page containing "403 ERROR" or "Request blocked" is treated as a
      # WAF block, and any Ferrum error is rescued.
      #
      # @param url [String] page to load
      # @param browser_opts [Hash, nil] extra Ferrum::Browser options; they
      #   win over the defaults, except that a +:browser_options+ Hash is
      #   merged into the default Chrome flags instead of replacing them.
      #   nil is treated as "no overrides".
      # @return [String, nil] page HTML, or nil when blocked or on error
      def self.fetch(url, browser_opts: {})
        browser = Ferrum::Browser.new(**launch_options(browser_opts))

        browser.headers.set(random_headers)
        browser.go_to(url)
        browser.network.wait_for_idle(timeout: 15)
        html = browser.body

        if html.include?("403 ERROR") || html.include?("Request blocked")
          warn "IEV: AWS WAF blocked request for #{url}"
          return nil
        end

        html
      rescue Ferrum::Error, Ferrum::BrowserError => e
        warn "IEV: Browser error fetching #{url}: #{e.message}"
        nil
      ensure
        # The browser is nil when its construction itself raised.
        browser&.quit
      end

      # Pick one profile at random and build a matching set of request
      # headers (Accept, client hints, fetch metadata, User-Agent).
      #
      # @return [Hash{String => String}]
      def self.random_headers
        profile = USER_AGENT_PROFILES.sample
        sec_ch_ua = "\"Google Chrome\";v=\"#{profile[:chrome_version]}\", " \
                    "\"Chromium\";v=\"#{profile[:chrome_version]}\", " \
                    "\"Not_A Brand\";v=\"24\""

        {
          "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
                      "image/avif,image/webp,image/apng,*/*;q=0.8," \
                      "application/signed-exchange;v=b3;q=0.7",
          "Accept-Language" => "en-GB,en-US;q=0.9,en;q=0.8",
          "Cache-Control" => "no-cache",
          "Pragma" => "no-cache",
          "Sec-Ch-Ua" => sec_ch_ua,
          "Sec-Ch-Ua-Mobile" => "?0",
          "Sec-Ch-Ua-Platform" => profile[:platform],
          "Sec-Fetch-Dest" => "document",
          "Sec-Fetch-Mode" => "navigate",
          "Sec-Fetch-Site" => "cross-site",
          "Sec-Fetch-User" => "?1",
          "Upgrade-Insecure-Requests" => "1",
          "User-Agent" => profile[:user_agent],
        }
      end

      # Combine the default Ferrum options with caller overrides.
      #
      # The defaults are rebuilt on every call so the hash handed to Ferrum
      # is never shared between browsers.
      #
      # @param overrides [Hash, nil]
      # @return [Hash]
      def self.launch_options(overrides)
        defaults = {
          headless: "new",
          timeout: 30,
          window_size: [1366, 768],
          browser_options: {
            "disable-blink-features" => "AutomationControlled",
          },
        }

        defaults.merge(overrides || {}) do |key, default, custom|
          # Keep the default Chrome flags unless the caller names the same
          # flag; every other option is replaced outright.
          key == :browser_options && custom.is_a?(Hash) ? default.merge(custom) : custom
        end
      end
      private_class_method :launch_options
    end
  end
end
|
data/lib/iev/scraper.rb
CHANGED
|
@@ -4,6 +4,9 @@ require "nokogiri"
|
|
|
4
4
|
|
|
5
5
|
module Iev
|
|
6
6
|
class Scraper
|
|
7
|
+
autoload :Browser, "iev/scraper/browser"
|
|
8
|
+
autoload :PageParser, "iev/scraper/page_parser"
|
|
9
|
+
|
|
7
10
|
BASE_URL = "https://www.electropedia.org/iev/iev.nsf/" \
|
|
8
11
|
"display?openform&ievref="
|
|
9
12
|
|
|
@@ -14,7 +17,8 @@ module Iev
|
|
|
14
17
|
# Fetch the Electropedia page HTML for a given IEV code.
|
|
15
18
|
# Returns a Nokogiri document.
|
|
16
19
|
def fetch_page(code)
|
|
17
|
-
html =
|
|
20
|
+
html = Browser.fetch("#{BASE_URL}#{code}",
|
|
21
|
+
browser_opts: @browser_opts)
|
|
18
22
|
return nil unless html
|
|
19
23
|
|
|
20
24
|
Nokogiri::HTML(html)
|
|
@@ -30,6 +34,3 @@ module Iev
|
|
|
30
34
|
end
|
|
31
35
|
end
|
|
32
36
|
end
|
|
33
|
-
|
|
34
|
-
require_relative "scraper/browser"
|
|
35
|
-
require_relative "scraper/page_parser"
|
data/lib/iev/section.rb
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Iev
  # Immutable value object representing an IEV section (e.g. "103-01").
  #
  # A section belongs to exactly one area, identified by +area_code+.
  # Equality and hashing look at +code+ only, so two sections with the same
  # code compare equal whatever their title.
  class Section
    attr_reader :code, :title, :area_code

    # @param code [#to_s] section code, e.g. "103-01"
    # @param title [#to_s] section title, e.g. "General concepts on functions"
    # @param area_code [#to_s] parent area code, e.g. "103"
    def initialize(code:, title:, area_code:)
      # String#to_s returns the receiver itself, so keep private frozen
      # copies; otherwise a caller mutating its own string afterwards would
      # change this object (and its #hash, which depends on +code+).
      @code = code.to_s.dup.freeze
      @title = title.to_s.dup.freeze
      @area_code = area_code.to_s.dup.freeze
      freeze
    end

    # @return [String] section identifier, e.g. "section-103-01"
    def uri
      "section-#{code}"
    end

    # @return [Hash{String => String}] code and title (the area code is
    #   not included)
    def to_h
      { "code" => code, "title" => title }
    end

    # Sections are equal when they are both Sections with the same code.
    def ==(other)
      other.is_a?(self.class) && code == other.code
    end
    alias_method :eql?, :==

    # Consistent with #eql?, so sections can be used as Hash keys.
    def hash
      code.hash
    end
  end
end
|