iev 0.4.5 → 0.4.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +1 -0
- data/.rubocop_todo.yml +56 -21
- data/CLAUDE.md +27 -5
- data/Gemfile +9 -0
- data/README.adoc +10 -3
- data/iev.gemspec +3 -2
- data/lib/iev/bibliography_builder.rb +87 -0
- data/lib/iev/cli/command.rb +3 -2
- data/lib/iev/cli/ui.rb +5 -5
- data/lib/iev/config.rb +1 -15
- data/lib/iev/data_source.rb +4 -2
- data/lib/iev/db_writer.rb +1 -0
- data/lib/iev/exporter.rb +139 -21
- data/lib/iev/figure_builder.rb +186 -0
- data/lib/iev/iso_639_code.rb +2 -1
- data/lib/iev/relaton_db.rb +1 -1
- data/lib/iev/scraper/browser.rb +90 -88
- data/lib/iev/scraper.rb +5 -4
- data/lib/iev/source_parser.rb +9 -10
- data/lib/iev/subject_area_concepts.rb +36 -33
- data/lib/iev/subject_areas.rb +9 -11
- data/lib/iev/term_attrs_parser.rb +1 -1
- data/lib/iev/term_builder.rb +14 -9
- data/lib/iev/utilities.rb +29 -1
- data/lib/iev/version.rb +1 -1
- data/lib/iev.rb +30 -6
- metadata +24 -27
data/lib/iev/exporter.rb
CHANGED
|
@@ -56,12 +56,18 @@ module Iev
|
|
|
56
56
|
collection = build_collection(dataset)
|
|
57
57
|
add_subject_area_concepts(collection) if @include_areas
|
|
58
58
|
build_section_narrower_relations(collection) if @include_areas
|
|
59
|
+
figures = FigureBuilder.extract!(collection)
|
|
60
|
+
enrich_references(collection)
|
|
59
61
|
save_collection(collection)
|
|
62
|
+
save_figures(figures)
|
|
63
|
+
save_bibliography(BibliographyBuilder.build(collection))
|
|
64
|
+
save_register
|
|
60
65
|
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
|
|
61
66
|
|
|
62
67
|
@stats = {
|
|
63
68
|
concept_count: collection.count,
|
|
64
69
|
localized_count: localized_count(collection),
|
|
70
|
+
figure_count: figures.length,
|
|
65
71
|
elapsed_seconds: elapsed,
|
|
66
72
|
}
|
|
67
73
|
collection
|
|
@@ -140,11 +146,16 @@ module Iev
|
|
|
140
146
|
term = TermBuilder.build_from(row)
|
|
141
147
|
next unless term
|
|
142
148
|
|
|
149
|
+
# Parse IevCode once per concept — used by all helpers below.
|
|
150
|
+
code = IevCode.new(term.id)
|
|
151
|
+
|
|
143
152
|
concept = concept_index[term.id] ||= begin
|
|
144
153
|
c = Glossarist::ManagedConcept.new(data: { "id" => term.id })
|
|
145
154
|
c.uuid = term.id
|
|
146
|
-
c.
|
|
147
|
-
|
|
155
|
+
c.schema_version = "3"
|
|
156
|
+
c.data.domains = domain_references_for(code)
|
|
157
|
+
c.data.tags = tags_for(code)
|
|
158
|
+
add_section_broader(c, code)
|
|
148
159
|
collection.store(c)
|
|
149
160
|
c
|
|
150
161
|
end
|
|
@@ -169,34 +180,125 @@ module Iev
|
|
|
169
180
|
collection.save_grouped_concepts_to_files(concepts_dir.to_s)
|
|
170
181
|
end
|
|
171
182
|
|
|
172
|
-
def
|
|
173
|
-
|
|
183
|
+
def save_figures(figures)
|
|
184
|
+
return if figures.empty?
|
|
185
|
+
|
|
186
|
+
figures_dir = output_dir.expand_path.join("figures")
|
|
187
|
+
FileUtils.mkdir_p(figures_dir)
|
|
188
|
+
figures.each do |figure|
|
|
189
|
+
path = figures_dir.join("#{figure.id}.yaml")
|
|
190
|
+
File.write(path, figure.to_yaml, encoding: "utf-8")
|
|
191
|
+
end
|
|
192
|
+
puts "Written #{figures.length} figures to figures/" if $stdout.tty?
|
|
174
193
|
end
|
|
175
194
|
|
|
176
|
-
|
|
195
|
+
def save_bibliography(bibliography)
|
|
196
|
+
return if bibliography.entries.empty?
|
|
177
197
|
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
198
|
+
path = output_dir.expand_path.join("bibliography.yaml")
|
|
199
|
+
FileUtils.mkdir_p(path.dirname)
|
|
200
|
+
File.write(path, bibliography.to_yaml, encoding: "utf-8")
|
|
201
|
+
count = bibliography.entries.length
|
|
202
|
+
puts "Written bibliography.yaml with #{count} entries" if $stdout.tty?
|
|
203
|
+
end
|
|
204
|
+
|
|
205
|
+
def enrich_references(collection)
|
|
206
|
+
return if collection.none?
|
|
207
|
+
|
|
208
|
+
Glossarist::ConceptEnricher.new.inject_references(collection.to_a)
|
|
209
|
+
end
|
|
210
|
+
|
|
211
|
+
def save_register
|
|
212
|
+
areas = SubjectAreas.all
|
|
213
|
+
sections = build_section_tree(areas)
|
|
214
|
+
|
|
215
|
+
register = Glossarist::DatasetRegister.new(
|
|
216
|
+
schema_type: "glossarist",
|
|
217
|
+
schema_version: "3",
|
|
218
|
+
id: "iev",
|
|
219
|
+
ref: "IEC 60050:2011",
|
|
220
|
+
year: 2011,
|
|
221
|
+
urn: IEV_SOURCE,
|
|
222
|
+
urn_aliases: ["#{IEV_SOURCE}*"],
|
|
223
|
+
status: "current",
|
|
224
|
+
owner: "IEC",
|
|
225
|
+
source_repo: "https://github.com/glossarist/iev-data",
|
|
226
|
+
tags: %w[electrotechnical vocabulary iec],
|
|
227
|
+
languages: %w[eng fra],
|
|
228
|
+
language_order: %w[eng fra],
|
|
229
|
+
ordering: "systematic",
|
|
230
|
+
sections: sections,
|
|
231
|
+
)
|
|
232
|
+
|
|
233
|
+
register_path = output_dir.expand_path.join("register.yaml")
|
|
234
|
+
FileUtils.mkdir_p(register_path.dirname)
|
|
235
|
+
File.write(register_path, register.to_yaml, encoding: "utf-8")
|
|
236
|
+
puts "Written register.yaml with #{sections.length} areas" if $stdout.tty?
|
|
237
|
+
end
|
|
238
|
+
|
|
239
|
+
def build_section_tree(areas)
|
|
240
|
+
areas.sort_by { |a| a.code.to_i }.map do |area|
|
|
241
|
+
children = area.sections.sort_by do |s|
|
|
242
|
+
s.code.split("-").map(&:to_i)
|
|
243
|
+
end.map do |sec|
|
|
244
|
+
Glossarist::Section.new(
|
|
245
|
+
id: sec.code,
|
|
246
|
+
names: { "eng" => sec.title },
|
|
247
|
+
)
|
|
248
|
+
end
|
|
249
|
+
|
|
250
|
+
Glossarist::Section.new(
|
|
251
|
+
id: area.code,
|
|
252
|
+
names: { "eng" => area.title },
|
|
253
|
+
children: children.empty? ? nil : children,
|
|
186
254
|
)
|
|
187
255
|
end
|
|
256
|
+
end
|
|
257
|
+
|
|
258
|
+
def localized_count(collection)
|
|
259
|
+
collection.sum { |c| c.localized_concepts.count }
|
|
260
|
+
end
|
|
261
|
+
|
|
262
|
+
# Build domain ConceptReferences for a concept.
|
|
263
|
+
#
|
|
264
|
+
# Per the concept model, ConceptReferenceType distinguishes:
|
|
265
|
+
# - "domain" → thematic/subject-area classification (area level)
|
|
266
|
+
# - "section" → structural section membership (section level)
|
|
267
|
+
#
|
|
268
|
+
# A concept with a section code gets both: a "domain" ref to its area and a "section"
|
|
269
|
+
# ref to its section. Concepts with only an area code (no section)
|
|
270
|
+
# get only a "domain" ref.
|
|
271
|
+
#
|
|
272
|
+
# @param code [IevCode] pre-parsed IEV code
|
|
273
|
+
# @return [Array<Glossarist::ConceptReference>]
|
|
274
|
+
def domain_references_for(code)
|
|
275
|
+
refs = []
|
|
276
|
+
|
|
277
|
+
# Domain reference: thematic classification at the area level
|
|
278
|
+
refs << domain_ref(code.area_uri)
|
|
279
|
+
|
|
280
|
+
# Section reference: structural membership in the section
|
|
188
281
|
if code.section_code
|
|
189
|
-
refs <<
|
|
190
|
-
concept_id: code.section_uri,
|
|
191
|
-
source: IEV_SOURCE,
|
|
192
|
-
ref_type: "domain",
|
|
193
|
-
)
|
|
282
|
+
refs << section_ref(code.section_uri)
|
|
194
283
|
end
|
|
284
|
+
|
|
195
285
|
refs
|
|
196
286
|
end
|
|
197
287
|
|
|
198
|
-
|
|
199
|
-
|
|
288
|
+
# @param code [IevCode] pre-parsed IEV code
|
|
289
|
+
# @return [Array<String>]
|
|
290
|
+
def tags_for(code)
|
|
291
|
+
tags = []
|
|
292
|
+
area = SubjectAreas.find_area(code.area_code)
|
|
293
|
+
tags << area.title if area
|
|
294
|
+
section = code.section_code && SubjectAreas.find_section(code.section_code)
|
|
295
|
+
tags << section.title if section
|
|
296
|
+
tags
|
|
297
|
+
end
|
|
298
|
+
|
|
299
|
+
# @param concept [Glossarist::ManagedConcept]
|
|
300
|
+
# @param code [IevCode] pre-parsed IEV code
|
|
301
|
+
def add_section_broader(concept, code)
|
|
200
302
|
return unless code.section_uri
|
|
201
303
|
|
|
202
304
|
concept.related ||= []
|
|
@@ -249,7 +351,9 @@ module Iev
|
|
|
249
351
|
|
|
250
352
|
concept.related ||= []
|
|
251
353
|
related.each do |r|
|
|
252
|
-
next if concept.related.any?
|
|
354
|
+
next if concept.related.any? do |er|
|
|
355
|
+
er.type == r.type && er.ref&.id == r.ref&.id
|
|
356
|
+
end
|
|
253
357
|
|
|
254
358
|
concept.related << r
|
|
255
359
|
end
|
|
@@ -263,5 +367,19 @@ module Iev
|
|
|
263
367
|
status = term.entry_status
|
|
264
368
|
concept.status = status if status && !status.empty?
|
|
265
369
|
end
|
|
370
|
+
|
|
371
|
+
# --- ConceptReference factory helpers ---
|
|
372
|
+
|
|
373
|
+
def domain_ref(concept_id)
|
|
374
|
+
ref = Glossarist::ConceptReference.domain(concept_id)
|
|
375
|
+
ref.source = IEV_SOURCE
|
|
376
|
+
ref
|
|
377
|
+
end
|
|
378
|
+
|
|
379
|
+
def section_ref(concept_id)
|
|
380
|
+
ref = Glossarist::ConceptReference.section(concept_id)
|
|
381
|
+
ref.source = IEV_SOURCE
|
|
382
|
+
ref
|
|
383
|
+
end
|
|
266
384
|
end
|
|
267
385
|
end
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Iev
|
|
4
|
+
# Hoists IEV figure references into dataset-shared Figure entities.
|
|
5
|
+
#
|
|
6
|
+
# IEV source data carries figures as inline SIMG tags, which Utilities
|
|
7
|
+
# rewrites to AsciiDoc image macros (+image::/assets/images/parts/{area}/
|
|
8
|
+
# FILE[Figure N - caption]+). This builder walks every concept's
|
|
9
|
+
# localizations, finds those image macros, promotes each to a
|
|
10
|
+
# dataset-shared Glossarist::Figure entity, and rewrites the inline text
|
|
11
|
+
# to a V3 figure mention (+{{fig:id, display}}+).
|
|
12
|
+
#
|
|
13
|
+
# The Figure entity is shared across concepts and languages — captions
|
|
14
|
+
# from different localizations merge into the same {lang => text} hash.
|
|
15
|
+
# The structural link from concept to figure is a FigureReference entry
|
|
16
|
+
# on ManagedConceptData#figures.
|
|
17
|
+
#
|
|
18
|
+
# Extraction is destructive: it mutates DetailedDefinition#content and
|
|
19
|
+
# appends FigureReference entries. Returns the unique Figure entities so
|
|
20
|
+
# the exporter can persist them to figures/{id}.yaml.
|
|
21
|
+
module FigureBuilder
|
|
22
|
+
# URL path prefix emitted by Utilities when converting SIMG tags.
|
|
23
|
+
# Kept in sync with Utilities::IMAGE_PATH_PREFIX (without the macro).
|
|
24
|
+
PATH_PREFIX = "/assets/images/parts"
|
|
25
|
+
private_constant :PATH_PREFIX
|
|
26
|
+
|
|
27
|
+
# Matches AsciiDoc image macros emitted by Utilities#process_simg_figures.
|
|
28
|
+
IMAGE_MACRO_REGEX = /
|
|
29
|
+
image::#{Regexp.escape(PATH_PREFIX)}
|
|
30
|
+
\/(?<area>\d+)\/(?<file>[\w.-]+)\[(?<caption>[^\]]*)\]
|
|
31
|
+
/x
|
|
32
|
+
|
|
33
|
+
# Captures "Figure N" label and the trailing caption text.
|
|
34
|
+
CAPTION_REGEX = /\A(?<label>Figure\s+\d+)\s*[–-]\s*(?<text>.+)\z/m
|
|
35
|
+
|
|
36
|
+
module_function
|
|
37
|
+
|
|
38
|
+
# @param collection [Glossarist::ManagedConceptCollection]
|
|
39
|
+
# @return [Array<Glossarist::Figure>] unique figures, sorted by id
|
|
40
|
+
def extract!(collection)
|
|
41
|
+
figures_by_id = {}
|
|
42
|
+
|
|
43
|
+
collection.each do |concept|
|
|
44
|
+
concept.localizations.each do |l10n|
|
|
45
|
+
process_localization(l10n, concept, figures_by_id)
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
figures_by_id.values.sort_by(&:id)
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
def process_localization(l10n, concept, figures_by_id)
|
|
53
|
+
lang = l10n.data&.language_code
|
|
54
|
+
return unless lang && lang.length == 3
|
|
55
|
+
|
|
56
|
+
Glossarist::ConceptData.detailed_definition_fields.each do |field|
|
|
57
|
+
process_field(l10n, field, lang, concept, figures_by_id)
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
private_class_method :process_localization
|
|
61
|
+
|
|
62
|
+
def process_field(l10n, field, lang, concept, figures_by_id)
|
|
63
|
+
l10n.data.public_send(field).each do |dd|
|
|
64
|
+
next unless dd.content&.include?("image::")
|
|
65
|
+
|
|
66
|
+
rewritten, hits = extract_from_text(dd.content, lang)
|
|
67
|
+
next if hits.empty?
|
|
68
|
+
|
|
69
|
+
dd.content = rewritten
|
|
70
|
+
hits.each { |hit| register_figure(hit, concept, figures_by_id) }
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
private_class_method :process_field
|
|
74
|
+
|
|
75
|
+
# @return [Array<(String, Array<Hash>)>] rewritten text and per-match
|
|
76
|
+
# figure descriptors ({ id:, identifier:, caption:, lang:, image: })
|
|
77
|
+
def extract_from_text(text, lang)
|
|
78
|
+
hits = []
|
|
79
|
+
rewritten = text.gsub(IMAGE_MACRO_REGEX) do
|
|
80
|
+
hit = build_hit(Regexp.last_match, lang)
|
|
81
|
+
hits << hit
|
|
82
|
+
mention_for(hit)
|
|
83
|
+
end
|
|
84
|
+
[rewritten, hits]
|
|
85
|
+
end
|
|
86
|
+
private_class_method :extract_from_text
|
|
87
|
+
|
|
88
|
+
def build_hit(match, lang)
|
|
89
|
+
identifier, caption = parse_caption(match[:caption])
|
|
90
|
+
{
|
|
91
|
+
id: figure_id_for(match[:file]),
|
|
92
|
+
identifier: identifier,
|
|
93
|
+
caption: caption,
|
|
94
|
+
lang: lang,
|
|
95
|
+
image: build_image(match[:area], match[:file]),
|
|
96
|
+
}
|
|
97
|
+
end
|
|
98
|
+
private_class_method :build_hit
|
|
99
|
+
|
|
100
|
+
def build_image(area, file)
|
|
101
|
+
Glossarist::FigureImage.new(
|
|
102
|
+
src: "#{PATH_PREFIX}/#{area}/#{file}",
|
|
103
|
+
format: format_for(file),
|
|
104
|
+
)
|
|
105
|
+
end
|
|
106
|
+
private_class_method :build_image
|
|
107
|
+
|
|
108
|
+
def parse_caption(bracket)
|
|
109
|
+
stripped = bracket.to_s.strip
|
|
110
|
+
return [nil, nil] if stripped.empty?
|
|
111
|
+
|
|
112
|
+
if (m = stripped.match(CAPTION_REGEX))
|
|
113
|
+
label = m[:label].gsub(/\s+/, " ")
|
|
114
|
+
[label, m[:text].strip]
|
|
115
|
+
elsif stripped.match?(/\AFigure\s+\d+\z/)
|
|
116
|
+
[stripped, nil]
|
|
117
|
+
else
|
|
118
|
+
[nil, stripped]
|
|
119
|
+
end
|
|
120
|
+
end
|
|
121
|
+
private_class_method :parse_caption
|
|
122
|
+
|
|
123
|
+
def figure_id_for(file)
|
|
124
|
+
"fig-#{file.sub(/\.[^.]+\z/, '')}"
|
|
125
|
+
end
|
|
126
|
+
private_class_method :figure_id_for
|
|
127
|
+
|
|
128
|
+
def format_for(file)
|
|
129
|
+
File.extname(file).delete_prefix(".").downcase
|
|
130
|
+
end
|
|
131
|
+
private_class_method :format_for
|
|
132
|
+
|
|
133
|
+
def mention_for(hit)
|
|
134
|
+
parts = [hit[:identifier], hit[:caption]].compact
|
|
135
|
+
return "{{fig:#{hit[:id]}}}" if parts.empty?
|
|
136
|
+
|
|
137
|
+
"{{fig:#{hit[:id]}, #{parts.join(' - ')}}}"
|
|
138
|
+
end
|
|
139
|
+
private_class_method :mention_for
|
|
140
|
+
|
|
141
|
+
# Add or merge a figure descriptor into the shared index, and ensure the
|
|
142
|
+
# concept carries a FigureReference to it.
|
|
143
|
+
def register_figure(hit, concept, figures_by_id)
|
|
144
|
+
figure = figures_by_id[hit[:id]] ||= build_figure(hit)
|
|
145
|
+
merge_caption!(figure, hit)
|
|
146
|
+
add_image_if_missing(figure, hit[:image])
|
|
147
|
+
add_figure_reference(concept, hit[:id], hit[:identifier])
|
|
148
|
+
end
|
|
149
|
+
private_class_method :register_figure
|
|
150
|
+
|
|
151
|
+
def build_figure(hit)
|
|
152
|
+
Glossarist::Figure.new(
|
|
153
|
+
id: hit[:id],
|
|
154
|
+
identifier: hit[:identifier],
|
|
155
|
+
images: [],
|
|
156
|
+
caption: {},
|
|
157
|
+
)
|
|
158
|
+
end
|
|
159
|
+
private_class_method :build_figure
|
|
160
|
+
|
|
161
|
+
def merge_caption!(figure, hit)
|
|
162
|
+
return unless hit[:caption]
|
|
163
|
+
|
|
164
|
+
figure.caption ||= {}
|
|
165
|
+
figure.caption[hit[:lang]] ||= hit[:caption]
|
|
166
|
+
end
|
|
167
|
+
private_class_method :merge_caption!
|
|
168
|
+
|
|
169
|
+
def add_image_if_missing(figure, image)
|
|
170
|
+
return if figure.images.any? { |i| i.src == image.src }
|
|
171
|
+
|
|
172
|
+
figure.images << image
|
|
173
|
+
end
|
|
174
|
+
private_class_method :add_image_if_missing
|
|
175
|
+
|
|
176
|
+
def add_figure_reference(concept, figure_id, display)
|
|
177
|
+
refs = Array(concept.data.figures)
|
|
178
|
+
return if refs.any? { |r| r.entity_id == figure_id }
|
|
179
|
+
|
|
180
|
+
concept.data.figures = refs + [
|
|
181
|
+
Glossarist::FigureReference.new(entity_id: figure_id, display: display),
|
|
182
|
+
]
|
|
183
|
+
end
|
|
184
|
+
private_class_method :add_figure_reference
|
|
185
|
+
end
|
|
186
|
+
end
|
data/lib/iev/iso_639_code.rb
CHANGED
|
@@ -6,7 +6,8 @@
|
|
|
6
6
|
module Iev
|
|
7
7
|
# @todo This needs to be rewritten.
|
|
8
8
|
class Iso639Code
|
|
9
|
-
COUNTRY_CODES = YAML.
|
|
9
|
+
COUNTRY_CODES = YAML.safe_load_file(File.join(__dir__,
|
|
10
|
+
"iso_639_2.yaml"), permitted_classes: [Symbol]).freeze
|
|
10
11
|
# rubocop:disable Style/MutableConstant
|
|
11
12
|
THREE_CHAR_MEMO = {} # Memoization cache, must be mutable
|
|
12
13
|
# rubocop:enable Style/MutableConstant
|
data/lib/iev/relaton_db.rb
CHANGED
data/lib/iev/scraper/browser.rb
CHANGED
|
@@ -3,100 +3,102 @@
|
|
|
3
3
|
require "ferrum"
|
|
4
4
|
|
|
5
5
|
module Iev
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
].freeze
|
|
45
|
-
|
|
46
|
-
# Fetch a URL using headless Chrome, returning the page HTML.
|
|
47
|
-
# Handles AWS WAF challenge pages by waiting for JS execution.
|
|
48
|
-
def self.fetch(url, browser_opts: {})
|
|
49
|
-
browser = Ferrum::Browser.new(
|
|
50
|
-
headless: "new",
|
|
51
|
-
timeout: 30,
|
|
52
|
-
window_size: [1366, 768],
|
|
53
|
-
browser_options: {
|
|
54
|
-
"disable-blink-features" => "AutomationControlled",
|
|
6
|
+
class Scraper
|
|
7
|
+
# Shared headless browser utilities for fetching pages behind AWS WAF.
|
|
8
|
+
module Browser
|
|
9
|
+
USER_AGENT_PROFILES = [
|
|
10
|
+
{
|
|
11
|
+
user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
|
|
12
|
+
"AppleWebKit/537.36 (KHTML, like Gecko) " \
|
|
13
|
+
"Chrome/131.0.0.0 Safari/537.36",
|
|
14
|
+
platform: '"macOS"',
|
|
15
|
+
chrome_version: "131",
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
|
|
19
|
+
"AppleWebKit/537.36 (KHTML, like Gecko) " \
|
|
20
|
+
"Chrome/130.0.0.0 Safari/537.36",
|
|
21
|
+
platform: '"Windows"',
|
|
22
|
+
chrome_version: "130",
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
user_agent: "Mozilla/5.0 (X11; Linux x86_64) " \
|
|
26
|
+
"AppleWebKit/537.36 (KHTML, like Gecko) " \
|
|
27
|
+
"Chrome/131.0.0.0 Safari/537.36",
|
|
28
|
+
platform: '"Linux"',
|
|
29
|
+
chrome_version: "131",
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
|
|
33
|
+
"AppleWebKit/537.36 (KHTML, like Gecko) " \
|
|
34
|
+
"Chrome/129.0.0.0 Safari/537.36",
|
|
35
|
+
platform: '"macOS"',
|
|
36
|
+
chrome_version: "129",
|
|
37
|
+
},
|
|
38
|
+
{
|
|
39
|
+
user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
|
|
40
|
+
"AppleWebKit/537.36 (KHTML, like Gecko) " \
|
|
41
|
+
"Chrome/131.0.0.0 Safari/537.36",
|
|
42
|
+
platform: '"Windows"',
|
|
43
|
+
chrome_version: "131",
|
|
55
44
|
},
|
|
56
|
-
|
|
57
|
-
)
|
|
45
|
+
].freeze
|
|
58
46
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
47
|
+
# Fetch a URL using headless Chrome, returning the page HTML (nil if the request is blocked or the browser errors).
|
|
48
|
+
# Handles AWS WAF challenge pages by waiting for JS execution.
|
|
49
|
+
def self.fetch(url, browser_opts: {})
|
|
50
|
+
browser = Ferrum::Browser.new(
|
|
51
|
+
headless: "new",
|
|
52
|
+
timeout: 30,
|
|
53
|
+
window_size: [1366, 768],
|
|
54
|
+
browser_options: {
|
|
55
|
+
"disable-blink-features" => "AutomationControlled",
|
|
56
|
+
},
|
|
57
|
+
**browser_opts,
|
|
58
|
+
)
|
|
63
59
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
60
|
+
browser.headers.set(random_headers)
|
|
61
|
+
browser.go_to(url)
|
|
62
|
+
browser.network.wait_for_idle(timeout: 15)
|
|
63
|
+
html = browser.body
|
|
68
64
|
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
ensure
|
|
74
|
-
browser&.quit
|
|
75
|
-
end
|
|
65
|
+
if html.include?("403 ERROR") || html.include?("Request blocked")
|
|
66
|
+
warn "IEV: AWS WAF blocked request for #{url}"
|
|
67
|
+
return nil
|
|
68
|
+
end
|
|
76
69
|
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
70
|
+
html
|
|
71
|
+
rescue Ferrum::Error, Ferrum::BrowserError => e
|
|
72
|
+
warn "IEV: Browser error fetching #{url}: #{e.message}"
|
|
73
|
+
nil
|
|
74
|
+
ensure
|
|
75
|
+
browser&.quit
|
|
76
|
+
end
|
|
82
77
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
"
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
78
|
+
def self.random_headers
|
|
79
|
+
profile = USER_AGENT_PROFILES.sample
|
|
80
|
+
sec_ch_ua = "\"Google Chrome\";v=\"#{profile[:chrome_version]}\", " \
|
|
81
|
+
"\"Chromium\";v=\"#{profile[:chrome_version]}\", " \
|
|
82
|
+
"\"Not_A Brand\";v=\"24\""
|
|
83
|
+
|
|
84
|
+
{
|
|
85
|
+
"Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
|
|
86
|
+
"image/avif,image/webp,image/apng,*/*;q=0.8," \
|
|
87
|
+
"application/signed-exchange;v=b3;q=0.7",
|
|
88
|
+
"Accept-Language" => "en-GB,en-US;q=0.9,en;q=0.8",
|
|
89
|
+
"Cache-Control" => "no-cache",
|
|
90
|
+
"Pragma" => "no-cache",
|
|
91
|
+
"Sec-Ch-Ua" => sec_ch_ua,
|
|
92
|
+
"Sec-Ch-Ua-Mobile" => "?0",
|
|
93
|
+
"Sec-Ch-Ua-Platform" => profile[:platform],
|
|
94
|
+
"Sec-Fetch-Dest" => "document",
|
|
95
|
+
"Sec-Fetch-Mode" => "navigate",
|
|
96
|
+
"Sec-Fetch-Site" => "cross-site",
|
|
97
|
+
"Sec-Fetch-User" => "?1",
|
|
98
|
+
"Upgrade-Insecure-Requests" => "1",
|
|
99
|
+
"User-Agent" => profile[:user_agent],
|
|
100
|
+
}
|
|
101
|
+
end
|
|
100
102
|
end
|
|
101
103
|
end
|
|
102
104
|
end
|
data/lib/iev/scraper.rb
CHANGED
|
@@ -4,6 +4,9 @@ require "nokogiri"
|
|
|
4
4
|
|
|
5
5
|
module Iev
|
|
6
6
|
class Scraper
|
|
7
|
+
autoload :Browser, "iev/scraper/browser"
|
|
8
|
+
autoload :PageParser, "iev/scraper/page_parser"
|
|
9
|
+
|
|
7
10
|
BASE_URL = "https://www.electropedia.org/iev/iev.nsf/" \
|
|
8
11
|
"display?openform&ievref="
|
|
9
12
|
|
|
@@ -14,7 +17,8 @@ module Iev
|
|
|
14
17
|
# Fetch the Electropedia page HTML for a given IEV code.
|
|
15
18
|
# Returns a Nokogiri document, or nil when no HTML could be fetched.
|
|
16
19
|
def fetch_page(code)
|
|
17
|
-
html =
|
|
20
|
+
html = Browser.fetch("#{BASE_URL}#{code}",
|
|
21
|
+
browser_opts: @browser_opts)
|
|
18
22
|
return nil unless html
|
|
19
23
|
|
|
20
24
|
Nokogiri::HTML(html)
|
|
@@ -30,6 +34,3 @@ module Iev
|
|
|
30
34
|
end
|
|
31
35
|
end
|
|
32
36
|
end
|
|
33
|
-
|
|
34
|
-
require_relative "scraper/browser"
|
|
35
|
-
require_relative "scraper/page_parser"
|