relaton-w3c 2.1.1 → 2.1.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 36c34b0daf914dbc2cf41cd4fe4842fc04d67147d977ac1282b9d7f0452538d7
4
- data.tar.gz: f6f0fdf9774a4bdb798b4d00e0ba4dae18e6d9ec772ce80d6604d5666fb22893
3
+ metadata.gz: 46f27195b30d4b285034731e6749b4ed4aff9ed607d41530a1d5b9d07c859b22
4
+ data.tar.gz: a263250dc4bda6a871af9addecbe37c5e3d62ab748797fe064dc9545e90eec8a
5
5
  SHA512:
6
- metadata.gz: a6b3105b9b30556d46badf314f8c298718422c2cda7b9e015af0469cf64d99205238c7e786b1d7d429277ef2f58498241f99a2f170463ff273f94a53e1d58857
7
- data.tar.gz: f51e0ee34082f78e51d0da502087ecd4ca7b7cb4e1903f7b2db20123ebddaf56d57c655c057825022df459f02e50ade7b1cbb7d3ad34aff4c621240aeebe6006
6
+ metadata.gz: 973bcf91864d27cb1f19f6001733dc0b46c66738d1e6614ff91313a61dcf03b6a721cbc8c82fb3bd64a00dccc8af50dd2efbf23d051ada503d4b8c698719bb68
7
+ data.tar.gz: 89fb5bcce023f69488932d901fbe260450bf1fcd6164c46be7731fb107bcb47a9aec30b9eb08a7fe534822727dcdb48c262e2386cf7f921ce018f0537fb33377
@@ -10,6 +10,20 @@ module Relaton
10
10
  class DataFetcher < Core::DataFetcher
11
11
  include Relaton::W3c::RateLimitHandler
12
12
 
13
+ DEFAULT_CONCURRENCY = 8
14
+
15
+ # Number of fetch_spec worker threads. Tunable via env var so CI or
16
+ # local runs can dial it down (e.g. for debugging or to lighten load
17
+ # on api.w3.org).
18
+ def self.concurrency
19
+ (ENV["RELATON_W3C_FETCH_CONCURRENCY"] || DEFAULT_CONCURRENCY).to_i
20
+ end
21
+
22
+ def initialize(*args)
23
+ super
24
+ @mutex = Mutex.new
25
+ end
26
+
13
27
  def index
14
28
  @index ||= Relaton::Index.find_or_create(:W3C, file: "#{INDEXFILE}.yaml")
15
29
  end
@@ -23,41 +37,54 @@ module Relaton
23
37
  end
24
38
 
25
39
  #
26
- # Parse documents
40
+ # Parse documents in parallel. The crawler is heavily I/O-bound on
41
+ # api.w3.org round-trips (~30-50k requests per run), so a small thread
42
+ # pool gives a near-linear speedup. Pagination still happens serially
43
+ # because each page depends on the previous response's `next` link.
27
44
  #
28
45
  def fetch(_source = nil)
46
+ n_workers = self.class.concurrency
47
+ queue = SizedQueue.new(n_workers * 4)
48
+ workers = Array.new(n_workers) { spawn_worker(queue) }
49
+
29
50
  specs = client.specifications
30
51
  loop do
31
- specs.links.specifications.each do |spec|
32
- fetch_spec spec
33
- end
34
-
52
+ specs.links.specifications.each { |spec| queue << spec }
35
53
  break unless specs.next?
36
54
 
37
55
  specs = specs.next
38
56
  end
57
+
58
+ n_workers.times { queue << nil } # poison pills
59
+ workers.each(&:join)
60
+
39
61
  index.save
40
62
  report_errors
41
63
  end
42
64
 
43
65
  def fetch_spec(unrealized_spec)
44
66
  spec = realize unrealized_spec
45
- save_doc DataParser.parse(spec, @errors)
67
+ return unless spec
68
+
69
+ local_errors = Hash.new(true)
70
+ save_doc DataParser.parse(spec, local_errors)
46
71
 
47
72
  if spec.links.respond_to?(:version_history) && spec.links.version_history
48
73
  version_history = realize spec.links.version_history
49
- version_history.links.spec_versions.each { |version| save_doc DataParser.parse(realize version) }
74
+ version_history&.links&.spec_versions&.each { |version| parse_and_save version }
50
75
  end
51
76
 
52
77
  if spec.links.respond_to?(:predecessor_versions) && spec.links.predecessor_versions
53
78
  predecessor_versions = realize spec.links.predecessor_versions
54
- predecessor_versions.links.predecessor_versions.each { |version| save_doc DataParser.parse(realize version) }
79
+ predecessor_versions&.links&.predecessor_versions&.each { |version| parse_and_save version }
55
80
  end
56
81
 
57
82
  if spec.links.respond_to?(:successor_versions) && spec.links.successor_versions
58
83
  successor_versions = realize spec.links.successor_versions
59
- successor_versions.links.successor_versions.each { |version| save_doc DataParser.parse(realize version) }
84
+ successor_versions&.links&.successor_versions&.each { |version| parse_and_save version }
60
85
  end
86
+
87
+ @mutex.synchronize { local_errors.each { |k, v| @errors[k] &&= v } }
61
88
  end
62
89
 
63
90
  #
@@ -69,14 +96,16 @@ module Relaton
69
96
  return unless bib
70
97
 
71
98
  file = file_name(bib.docnumber)
72
- if @files.include?(file)
73
- Util.warn "File #{file} already exists. Document: #{bib.docnumber}" if warn_duplicate
74
- else
75
- pubid = PubId.parse bib.docnumber
76
- index.add_or_update pubid.to_hash, file
77
- @files << file
99
+ @mutex.synchronize do
100
+ if @files.include?(file)
101
+ Util.warn "File #{file} already exists. Document: #{bib.docnumber}" if warn_duplicate
102
+ else
103
+ pubid = PubId.parse bib.docnumber
104
+ index.add_or_update pubid.to_hash, file
105
+ @files << file
106
+ end
107
+ File.write file, serialize(bib), encoding: "UTF-8"
78
108
  end
79
- File.write file, serialize(bib), encoding: "UTF-8"
80
109
  end
81
110
 
82
111
  def to_xml(bib)
@@ -102,6 +131,26 @@ module Relaton
102
131
  name = id.sub(/^W3C\s/, "").gsub(/[\s,:\/+]/, "_").squeeze("_").downcase
103
132
  File.join @output, "#{name}.#{@ext}"
104
133
  end
134
+
135
+ private
136
+
137
+ def spawn_worker(queue)
138
+ Thread.new do
139
+ while (spec = queue.pop)
140
+ begin
141
+ fetch_spec spec
142
+ rescue StandardError => e
143
+ log_error "fetch_spec failed: #{e.class}: #{e.message}\n" \
144
+ "#{e.backtrace.first(5).join("\n")}"
145
+ end
146
+ end
147
+ end
148
+ end
149
+
150
+ def parse_and_save(version)
151
+ realized = realize version
152
+ save_doc DataParser.parse(realized) if realized
153
+ end
105
154
  end
106
155
  end
107
156
  end
@@ -236,10 +236,11 @@ module Relaton
236
236
  def parse_relation
237
237
  result = if @spec.links.respond_to?(:version_history)
238
238
  version_history = realize @spec.links.version_history
239
- version_history.links.spec_versions.map { |version| create_relation(version, "hasEdition") }
239
+ version_history&.links&.spec_versions&.map { |version| create_relation(version, "hasEdition") } || []
240
240
  else
241
241
  relations
242
242
  end
243
+ result = result.compact
243
244
  @errors[:relation] &&= result.empty?
244
245
  result
245
246
  end
@@ -254,17 +255,17 @@ module Relaton
254
255
  rels << create_relation(@spec.links.specification, "editionOf") if @spec.links.respond_to?(:specification)
255
256
  if @spec.links.respond_to?(:predecessor_versions) && @spec.links.predecessor_versions
256
257
  predecessor_versions = realize @spec.links.predecessor_versions
257
- predecessor_versions.links.predecessor_versions.each do |version|
258
+ predecessor_versions&.links&.predecessor_versions&.each do |version|
258
259
  rels << create_relation(version, "obsoletes")
259
260
  end
260
261
  end
261
262
  if @spec.links.respond_to?(:successor_versions) && @spec.links.successor_versions
262
263
  successor_versions = realize @spec.links.successor_versions
263
- successor_versions.links.successor_versions.each do |version|
264
+ successor_versions&.links&.successor_versions&.each do |version|
264
265
  rels << create_relation(version, "updatedBy", "errata")
265
266
  end
266
267
  end
267
- rels
268
+ rels.compact
268
269
  end
269
270
 
270
271
  #
@@ -278,6 +279,8 @@ module Relaton
278
279
  #
279
280
  def create_relation(version, type, desc = nil)
280
281
  version_spec = realize version
282
+ return nil unless version_spec
283
+
281
284
  url = doc_uri(version_spec)
282
285
  id = pub_id(url)
283
286
  title = parse_title(version_spec)
@@ -314,7 +317,7 @@ module Relaton
314
317
 
315
318
  if @spec.links.respond_to?(:editors)
316
319
  editors = realize @spec.links.editors
317
- editors.links.editors&.each do |ed|
320
+ editors&.links&.editors&.each do |ed|
318
321
  editor = create_editor(ed)
319
322
  contribs << editor if editor
320
323
  end
@@ -1,3 +1,5 @@
1
+ require "concurrent/map"
2
+
1
3
  module Relaton
2
4
  module W3c
3
5
  module RateLimitHandler
@@ -7,8 +9,11 @@ module Relaton
7
9
  Lutaml::Hal::ServerError, Faraday::ConnectionFailed, Net::OpenTimeout,
8
10
  ].freeze
9
11
 
12
+ # Concurrent::Map so multiple fetcher threads can hit the cache without
13
+ # a global lock. Duplicate concurrent fetches of the same URL are
14
+ # possible but harmless; the second write just replaces the first.
10
15
  def self.fetched_objects
11
- @fetched_objects ||= {}
16
+ @fetched_objects ||= Concurrent::Map.new
12
17
  end
13
18
 
14
19
  def realize(obj)
@@ -38,6 +43,12 @@ module Relaton
38
43
  rescue Lutaml::Hal::NotFoundError
39
44
  Util.warn "Object not found: #{href}"
40
45
  RateLimitHandler.fetched_objects[href] = nil
46
+ rescue Lutaml::Hal::Error => e
47
+ # Non-retryable client-side errors (403/401/400 and any other
48
+ # Lutaml::Hal::Error not matched above) — skip the resource and
49
+ # continue rather than aborting the whole crawl.
50
+ Util.warn "Client error for #{href}, skipping: #{e.message}"
51
+ RateLimitHandler.fetched_objects[href] = nil
41
52
  end
42
53
  end
43
54
 
@@ -1,5 +1,5 @@
1
1
  module Relaton
2
2
  module W3c
3
- VERSION = "2.1.1".freeze
3
+ VERSION = "2.1.2".freeze
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-w3c
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.1
4
+ version: 2.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-05-05 00:00:00.000000000 Z
11
+ date: 2026-05-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: linkeddata