relaton-w3c 2.1.2 → 2.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CLAUDE.md +15 -6
- data/Gemfile +1 -0
- data/lib/relaton/w3c/data_fetcher.rb +8 -3
- data/lib/relaton/w3c/data_parser.rb +1 -1
- data/lib/relaton/w3c/safe_realize.rb +55 -0
- data/lib/relaton/w3c/version.rb +1 -1
- data/relaton-w3c.gemspec +1 -9
- metadata +5 -122
- data/grammars/basicdoc.rng +0 -2140
- data/grammars/biblio-standoc.rng +0 -268
- data/grammars/biblio.rng +0 -2125
- data/grammars/relaton-w3c-compile.rng +0 -11
- data/grammars/relaton-w3c.rng +0 -11
- data/lib/relaton/w3c/rate_limit_handler.rb +0 -62
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
<?xml version="1.0" encoding="UTF-8"?>
|
|
2
|
-
<grammar xmlns="http://relaxng.org/ns/structure/1.0">
|
|
3
|
-
<include href="basicdoc.rng"/>
|
|
4
|
-
<include href="relaton-w3c.rng"/>
|
|
5
|
-
<start>
|
|
6
|
-
<choice>
|
|
7
|
-
<ref name="bibitem"/>
|
|
8
|
-
<ref name="bibdata"/>
|
|
9
|
-
</choice>
|
|
10
|
-
</start>
|
|
11
|
-
</grammar>
|
data/grammars/relaton-w3c.rng
DELETED
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
<?xml version="1.0" encoding="UTF-8"?>
|
|
2
|
-
<grammar xmlns="http://relaxng.org/ns/structure/1.0">
|
|
3
|
-
<include href="biblio-standoc.rng">
|
|
4
|
-
<define name="DocumentType">
|
|
5
|
-
<choice>
|
|
6
|
-
<value>groupNote</value>
|
|
7
|
-
<value>technicalReport</value>
|
|
8
|
-
</choice>
|
|
9
|
-
</define>
|
|
10
|
-
</include>
|
|
11
|
-
</grammar>
|
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
require "concurrent/map"
|
|
2
|
-
|
|
3
|
-
module Relaton
|
|
4
|
-
module W3c
|
|
5
|
-
module RateLimitHandler
|
|
6
|
-
MAX_RETRIES = 5
|
|
7
|
-
RETRYABLE_ERRORS = [
|
|
8
|
-
NameError, Lutaml::Hal::ConnectionError, Lutaml::Hal::TimeoutError,
|
|
9
|
-
Lutaml::Hal::ServerError, Faraday::ConnectionFailed, Net::OpenTimeout,
|
|
10
|
-
].freeze
|
|
11
|
-
|
|
12
|
-
# Concurrent::Map so multiple fetcher threads can hit the cache without
|
|
13
|
-
# a global lock. Duplicate concurrent fetches of the same URL are
|
|
14
|
-
# possible but harmless; the second write just replaces the first.
|
|
15
|
-
def self.fetched_objects
|
|
16
|
-
@fetched_objects ||= Concurrent::Map.new
|
|
17
|
-
end
|
|
18
|
-
|
|
19
|
-
def realize(obj)
|
|
20
|
-
href = resolve_href(obj)
|
|
21
|
-
return RateLimitHandler.fetched_objects[href] if RateLimitHandler.fetched_objects.key?(href)
|
|
22
|
-
|
|
23
|
-
attempt = 1
|
|
24
|
-
begin
|
|
25
|
-
RateLimitHandler.fetched_objects[href] = obj.realize
|
|
26
|
-
rescue *RETRYABLE_ERRORS => e
|
|
27
|
-
if attempt < MAX_RETRIES
|
|
28
|
-
sleep_time = attempt * attempt
|
|
29
|
-
attempt += 1
|
|
30
|
-
Util.warn "Rate limit exceeded for #{href}, retrying in #{sleep_time} seconds..."
|
|
31
|
-
sleep sleep_time
|
|
32
|
-
retry
|
|
33
|
-
elsif e.is_a?(Lutaml::Hal::ServerError)
|
|
34
|
-
# Persistent 5xx — cache nil so a permanently broken upstream
|
|
35
|
-
# resource is skipped on the next lookup instead of re-tried.
|
|
36
|
-
Util.warn "Server error for #{href}, skipping: #{e.message}"
|
|
37
|
-
RateLimitHandler.fetched_objects[href] = nil
|
|
38
|
-
else
|
|
39
|
-
# Do not cache on retries exhausted — transient failures should not
|
|
40
|
-
# permanently poison the cache; subsequent calls will retry fresh.
|
|
41
|
-
Util.warn "Failed to realize object: #{href}, error: #{e.message}"
|
|
42
|
-
end
|
|
43
|
-
rescue Lutaml::Hal::NotFoundError
|
|
44
|
-
Util.warn "Object not found: #{href}"
|
|
45
|
-
RateLimitHandler.fetched_objects[href] = nil
|
|
46
|
-
rescue Lutaml::Hal::Error => e
|
|
47
|
-
# Non-retryable client-side errors (403/401/400 and any other
|
|
48
|
-
# Lutaml::Hal::Error not matched above) — skip the resource and
|
|
49
|
-
# continue rather than aborting the whole crawl.
|
|
50
|
-
Util.warn "Client error for #{href}, skipping: #{e.message}"
|
|
51
|
-
RateLimitHandler.fetched_objects[href] = nil
|
|
52
|
-
end
|
|
53
|
-
end
|
|
54
|
-
|
|
55
|
-
private
|
|
56
|
-
|
|
57
|
-
def resolve_href(obj)
|
|
58
|
-
obj.href || obj.links.self.href
|
|
59
|
-
end
|
|
60
|
-
end
|
|
61
|
-
end
|
|
62
|
-
end
|