relaton-w3c 2.1.1 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/relaton/w3c/data_fetcher.rb +65 -16
- data/lib/relaton/w3c/data_parser.rb +8 -5
- data/lib/relaton/w3c/rate_limit_handler.rb +12 -1
- data/lib/relaton/w3c/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 46f27195b30d4b285034731e6749b4ed4aff9ed607d41530a1d5b9d07c859b22
|
|
4
|
+
data.tar.gz: a263250dc4bda6a871af9addecbe37c5e3d62ab748797fe064dc9545e90eec8a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 973bcf91864d27cb1f19f6001733dc0b46c66738d1e6614ff91313a61dcf03b6a721cbc8c82fb3bd64a00dccc8af50dd2efbf23d051ada503d4b8c698719bb68
|
|
7
|
+
data.tar.gz: 89fb5bcce023f69488932d901fbe260450bf1fcd6164c46be7731fb107bcb47a9aec30b9eb08a7fe534822727dcdb48c262e2386cf7f921ce018f0537fb33377
|
|
@@ -10,6 +10,20 @@ module Relaton
|
|
|
10
10
|
class DataFetcher < Core::DataFetcher
|
|
11
11
|
include Relaton::W3c::RateLimitHandler
|
|
12
12
|
|
|
13
|
+
DEFAULT_CONCURRENCY = 8
|
|
14
|
+
|
|
15
|
+
# Number of fetch_spec worker threads. Tunable via the
# RELATON_W3C_FETCH_CONCURRENCY env var so CI or local runs can dial it
# down (e.g. for debugging or to lighten load on api.w3.org).
#
# A setting that does not yield a positive integer ("", "abc", "0", "-2")
# falls back to DEFAULT_CONCURRENCY: a zero or negative worker count would
# make SizedQueue.new raise in #fetch instead of starting the crawl.
#
# @return [Integer] requested worker count, or DEFAULT_CONCURRENCY
def self.concurrency
  # nil.to_i and "abc".to_i are both 0, so unset and garbage values share
  # the fallback path below.
  requested = ENV["RELATON_W3C_FETCH_CONCURRENCY"].to_i
  requested.positive? ? requested : DEFAULT_CONCURRENCY
end
|
|
21
|
+
|
|
22
|
+
# Runs the parent initializer with the arguments as received, then creates
# the lock used by the methods that touch state shared across worker
# threads.
#
# @param args [Array] passed through to the superclass initializer
def initialize(*args)
  super
  @mutex = Thread::Mutex.new
end
|
|
26
|
+
|
|
13
27
|
# Memoized W3C index stored in "#{INDEXFILE}.yaml".
#
# @return [Object] the value returned by Relaton::Index.find_or_create
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:W3C, file: "#{INDEXFILE}.yaml")
end
|
|
@@ -23,41 +37,54 @@ module Relaton
|
|
|
23
37
|
end
|
|
24
38
|
|
|
25
39
|
#
|
|
26
|
-
# Parse documents
|
|
40
|
+
# Parse documents in parallel. The crawler is heavily I/O-bound on
|
|
41
|
+
# api.w3.org round-trips (~30-50k requests per run), so a small thread
|
|
42
|
+
# pool gives a near-linear speedup. Pagination still happens serially
|
|
43
|
+
# because each page depends on the previous response's `next` link.
|
|
27
44
|
#
|
|
28
45
|
# Crawl every specification page and hand each spec to a pool of worker
# threads, then save the index and report errors.
#
# Workers are always released and joined, even when pagination raises, so
# a failed crawl does not leave threads blocked on the queue. The original
# error still propagates and the index is not saved in that case.
#
# @param _source [Object, nil] unused
# @return [Object] whatever report_errors returns
def fetch(_source = nil)
  n_workers = self.class.concurrency
  # Bounded queue: the producer blocks once it is n_workers * 4 specs ahead
  # of the workers instead of buffering the whole listing.
  queue = SizedQueue.new(n_workers * 4)
  workers = Array.new(n_workers) { spawn_worker(queue) }

  begin
    specs = client.specifications
    loop do
      specs.links.specifications.each { |spec| queue << spec }
      break unless specs.next?

      specs = specs.next
    end
  ensure
    # One poison pill per worker; each worker stops at the first nil it pops.
    n_workers.times { queue << nil }
    workers.each(&:join)
  end

  index.save
  report_errors
end
|
|
42
64
|
|
|
43
65
|
# Realize one specification, save it, then save every document reachable
# through its version-history, predecessor and successor links.
#
# Parser errors are gathered in a per-call hash and merged into @errors
# under the lock at the end.
#
# @param unrealized_spec [Object] value accepted by #realize
# @return [Object, nil] nil when the spec could not be realized
def fetch_spec(unrealized_spec)
  spec = realize unrealized_spec
  return unless spec

  local_errors = Hash.new(true)
  save_doc DataParser.parse(spec, local_errors)

  # link name on the spec => collection name inside the realized link
  {
    version_history: :spec_versions,
    predecessor_versions: :predecessor_versions,
    successor_versions: :successor_versions,
  }.each do |rel, collection|
    next unless spec.links.respond_to?(rel) && spec.links.public_send(rel)

    related = realize spec.links.public_send(rel)
    # realize may yield nil; the safe navigation then skips the link.
    related&.links&.public_send(collection)&.each { |version| parse_and_save version }
  end

  @mutex.synchronize { local_errors.each { |key, flag| @errors[key] &&= flag } }
end
|
|
62
89
|
|
|
63
90
|
#
|
|
@@ -69,14 +96,16 @@ module Relaton
|
|
|
69
96
|
return unless bib
|
|
70
97
|
|
|
71
98
|
file = file_name(bib.docnumber)
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
99
|
+
@mutex.synchronize do
|
|
100
|
+
if @files.include?(file)
|
|
101
|
+
Util.warn "File #{file} already exists. Document: #{bib.docnumber}" if warn_duplicate
|
|
102
|
+
else
|
|
103
|
+
pubid = PubId.parse bib.docnumber
|
|
104
|
+
index.add_or_update pubid.to_hash, file
|
|
105
|
+
@files << file
|
|
106
|
+
end
|
|
107
|
+
File.write file, serialize(bib), encoding: "UTF-8"
|
|
78
108
|
end
|
|
79
|
-
File.write file, serialize(bib), encoding: "UTF-8"
|
|
80
109
|
end
|
|
81
110
|
|
|
82
111
|
def to_xml(bib)
|
|
@@ -102,6 +131,26 @@ module Relaton
|
|
|
102
131
|
name = id.sub(/^W3C\s/, "").gsub(/[\s,:\/+]/, "_").squeeze("_").downcase
|
|
103
132
|
File.join @output, "#{name}.#{@ext}"
|
|
104
133
|
end
|
|
134
|
+
|
|
135
|
+
private
|
|
136
|
+
|
|
137
|
+
# Start a worker thread that pops specs from +queue+ and passes each one to
# #fetch_spec until it pops a falsy value (the nil poison pill).
#
# A StandardError raised for one spec is logged with the first five
# backtrace lines and the worker moves on to the next spec.
#
# @param queue [#pop] source of specs
# @return [Thread] the started worker
def spawn_worker(queue)
  Thread.new do
    while (job = queue.pop)
      begin
        fetch_spec job
      rescue StandardError => error
        trace = error.backtrace.first(5).join("\n")
        log_error "fetch_spec failed: #{error.class}: #{error.message}\n#{trace}"
      end
    end
  end
end
|
|
149
|
+
|
|
150
|
+
# Realize a single version link and, when that yields something, parse and
# save it.
#
# @param version [Object] value accepted by #realize
# @return [Object, nil] result of save_doc, or nil when nothing was realized
def parse_and_save(version)
  doc = realize(version)
  return unless doc

  save_doc(DataParser.parse(doc))
end
|
|
105
154
|
end
|
|
106
155
|
end
|
|
107
156
|
end
|
|
@@ -236,10 +236,11 @@ module Relaton
|
|
|
236
236
|
# Build the relation list for @spec.
#
# When the spec's links respond to version_history, each entry of the
# realized history's spec_versions becomes a "hasEdition" relation;
# otherwise the list comes from #relations. Nil entries are dropped, and
# the :relation error flag is and-ed with "no relation found".
#
# @return [Array] relations without nil entries
def parse_relation
  found =
    if @spec.links.respond_to?(:version_history)
      history = realize @spec.links.version_history
      editions = history&.links&.spec_versions
      # A missing history or collection yields an empty list.
      (editions || []).map { |edition| create_relation(edition, "hasEdition") }
    else
      relations
    end
  found = found.compact
  @errors[:relation] &&= found.empty?
  found
end
|
|
@@ -254,17 +255,17 @@ module Relaton
|
|
|
254
255
|
rels << create_relation(@spec.links.specification, "editionOf") if @spec.links.respond_to?(:specification)
|
|
255
256
|
if @spec.links.respond_to?(:predecessor_versions) && @spec.links.predecessor_versions
|
|
256
257
|
predecessor_versions = realize @spec.links.predecessor_versions
|
|
257
|
-
predecessor_versions
|
|
258
|
+
predecessor_versions&.links&.predecessor_versions&.each do |version|
|
|
258
259
|
rels << create_relation(version, "obsoletes")
|
|
259
260
|
end
|
|
260
261
|
end
|
|
261
262
|
if @spec.links.respond_to?(:successor_versions) && @spec.links.successor_versions
|
|
262
263
|
successor_versions = realize @spec.links.successor_versions
|
|
263
|
-
successor_versions
|
|
264
|
+
successor_versions&.links&.successor_versions&.each do |version|
|
|
264
265
|
rels << create_relation(version, "updatedBy", "errata")
|
|
265
266
|
end
|
|
266
267
|
end
|
|
267
|
-
rels
|
|
268
|
+
rels.compact
|
|
268
269
|
end
|
|
269
270
|
|
|
270
271
|
#
|
|
@@ -278,6 +279,8 @@ module Relaton
|
|
|
278
279
|
#
|
|
279
280
|
def create_relation(version, type, desc = nil)
|
|
280
281
|
version_spec = realize version
|
|
282
|
+
return nil unless version_spec
|
|
283
|
+
|
|
281
284
|
url = doc_uri(version_spec)
|
|
282
285
|
id = pub_id(url)
|
|
283
286
|
title = parse_title(version_spec)
|
|
@@ -314,7 +317,7 @@ module Relaton
|
|
|
314
317
|
|
|
315
318
|
if @spec.links.respond_to?(:editors)
|
|
316
319
|
editors = realize @spec.links.editors
|
|
317
|
-
editors
|
|
320
|
+
editors&.links&.editors&.each do |ed|
|
|
318
321
|
editor = create_editor(ed)
|
|
319
322
|
contribs << editor if editor
|
|
320
323
|
end
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
require "concurrent/map"
|
|
2
|
+
|
|
1
3
|
module Relaton
|
|
2
4
|
module W3c
|
|
3
5
|
module RateLimitHandler
|
|
@@ -7,8 +9,11 @@ module Relaton
|
|
|
7
9
|
Lutaml::Hal::ServerError, Faraday::ConnectionFailed, Net::OpenTimeout,
|
|
8
10
|
].freeze
|
|
9
11
|
|
|
12
|
+
# Shared cache of fetched objects. A Concurrent::Map lets several fetcher
# threads use it without a global lock. Two threads may still fetch the
# same URL at the same time; that is harmless, the later write simply
# replaces the earlier one.
#
# NOTE(review): the lazy creation below is not itself synchronized —
# confirm the first call cannot race between worker threads.
def self.fetched_objects
  return @fetched_objects if @fetched_objects

  @fetched_objects = Concurrent::Map.new
end
|
|
13
18
|
|
|
14
19
|
def realize(obj)
|
|
@@ -38,6 +43,12 @@ module Relaton
|
|
|
38
43
|
rescue Lutaml::Hal::NotFoundError
|
|
39
44
|
Util.warn "Object not found: #{href}"
|
|
40
45
|
RateLimitHandler.fetched_objects[href] = nil
|
|
46
|
+
rescue Lutaml::Hal::Error => e
|
|
47
|
+
# Non-retryable client-side errors (403/401/400 and any other
|
|
48
|
+
# Lutaml::Hal::Error not matched above) — skip the resource and
|
|
49
|
+
# continue rather than aborting the whole crawl.
|
|
50
|
+
Util.warn "Client error for #{href}, skipping: #{e.message}"
|
|
51
|
+
RateLimitHandler.fetched_objects[href] = nil
|
|
41
52
|
end
|
|
42
53
|
end
|
|
43
54
|
|
data/lib/relaton/w3c/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: relaton-w3c
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.1.1
|
|
4
|
+
version: 2.1.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-05-
|
|
11
|
+
date: 2026-05-15 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: linkeddata
|