relaton-iso 1.20.0 → 2.0.0.pre.alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +1 -1
- data/Gemfile +1 -0
- data/README.adoc +134 -130
- data/bin/console +1 -1
- data/grammars/basicdoc.rng +2110 -0
- data/grammars/biblio-standoc.rng +287 -0
- data/grammars/biblio.rng +2097 -0
- data/grammars/relaton-iso-compile.rng +11 -0
- data/grammars/relaton-iso.rng +214 -0
- data/lib/relaton/iso/bibliography.rb +206 -0
- data/lib/relaton/iso/data_fetcher.rb +227 -0
- data/lib/relaton/iso/hash_parser_v1.rb +121 -0
- data/lib/relaton/iso/hit.rb +62 -0
- data/lib/relaton/iso/hit_collection.rb +117 -0
- data/lib/relaton/iso/item_data.rb +49 -0
- data/lib/relaton/iso/model/bibdata.rb +9 -0
- data/lib/relaton/iso/model/bibitem.rb +7 -0
- data/lib/relaton/iso/model/contributor.rb +7 -0
- data/lib/relaton/iso/model/contributor_info.rb +9 -0
- data/lib/relaton/iso/model/docidentifier.rb +128 -0
- data/lib/relaton/iso/model/doctype.rb +13 -0
- data/lib/relaton/iso/model/ext.rb +47 -0
- data/lib/relaton/iso/model/iso_project_group.rb +21 -0
- data/lib/relaton/iso/model/item.rb +17 -0
- data/lib/relaton/iso/model/item_base.rb +19 -0
- data/lib/relaton/iso/model/organization.rb +9 -0
- data/lib/relaton/iso/model/project_number.rb +22 -0
- data/lib/relaton/iso/model/relation.rb +9 -0
- data/lib/relaton/iso/model/stagename.rb +14 -0
- data/lib/relaton/iso/model/structured_identifier.rb +31 -0
- data/lib/relaton/iso/processor.rb +78 -0
- data/lib/relaton/iso/queue.rb +63 -0
- data/lib/relaton/iso/scraper.rb +591 -0
- data/lib/relaton/iso/util.rb +8 -0
- data/lib/relaton/iso/version.rb +7 -0
- data/lib/relaton/iso.rb +17 -0
- data/relaton_iso.gemspec +9 -7
- metadata +76 -46
- data/bin/bundle +0 -109
- data/bin/byebug +0 -27
- data/bin/coderay +0 -27
- data/bin/gdb_wrapper +0 -29
- data/bin/htmldiff +0 -27
- data/bin/httpclient +0 -29
- data/bin/ldiff +0 -27
- data/bin/nokogiri +0 -27
- data/bin/pry +0 -27
- data/bin/pubid-nist +0 -27
- data/bin/racc +0 -27
- data/bin/rackup +0 -29
- data/bin/rake +0 -27
- data/bin/rubocop +0 -27
- data/bin/ruby-parse +0 -27
- data/bin/ruby-rewrite +0 -27
- data/bin/safe_yaml +0 -29
- data/bin/thor +0 -27
- data/lib/relaton_iso/data_fetcher.rb +0 -246
- data/lib/relaton_iso/document_identifier.rb +0 -46
- data/lib/relaton_iso/hash_converter.rb +0 -15
- data/lib/relaton_iso/hit.rb +0 -59
- data/lib/relaton_iso/hit_collection.rb +0 -100
- data/lib/relaton_iso/iso_bibliography.rb +0 -202
- data/lib/relaton_iso/processor.rb +0 -67
- data/lib/relaton_iso/queue.rb +0 -61
- data/lib/relaton_iso/scrapper.rb +0 -553
- data/lib/relaton_iso/util.rb +0 -6
- data/lib/relaton_iso/version.rb +0 -5
- data/lib/relaton_iso.rb +0 -17
@@ -0,0 +1,11 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<grammar xmlns="http://relaxng.org/ns/structure/1.0">
|
3
|
+
<include href="basicdoc.rng"/>
|
4
|
+
<include href="relaton-iso.rng"/>
|
5
|
+
<start>
|
6
|
+
<choice>
|
7
|
+
<ref name="bibitem"/>
|
8
|
+
<ref name="bibdata"/>
|
9
|
+
</choice>
|
10
|
+
</start>
|
11
|
+
</grammar>
|
@@ -0,0 +1,214 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<grammar xmlns="http://relaxng.org/ns/structure/1.0" datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes">
|
3
|
+
<include href="biblio-standoc.rng">
|
4
|
+
<define name="BibDataExtensionType">
|
5
|
+
<optional>
|
6
|
+
<attribute name="schema-version"/>
|
7
|
+
</optional>
|
8
|
+
<ref name="doctype"/>
|
9
|
+
<optional>
|
10
|
+
<ref name="docsubtype"/>
|
11
|
+
</optional>
|
12
|
+
<ref name="flavor"/>
|
13
|
+
<optional>
|
14
|
+
<ref name="horizontal"/>
|
15
|
+
</optional>
|
16
|
+
<ref name="editorialgroup"/>
|
17
|
+
<optional>
|
18
|
+
<ref name="approvalgroup"/>
|
19
|
+
</optional>
|
20
|
+
<zeroOrMore>
|
21
|
+
<ref name="ics"/>
|
22
|
+
</zeroOrMore>
|
23
|
+
<ref name="structuredidentifier"/>
|
24
|
+
<optional>
|
25
|
+
<ref name="stagename"/>
|
26
|
+
</optional>
|
27
|
+
<optional>
|
28
|
+
<ref name="updates_document_type"/>
|
29
|
+
</optional>
|
30
|
+
<optional>
|
31
|
+
<ref name="fast_track"/>
|
32
|
+
</optional>
|
33
|
+
<optional>
|
34
|
+
<ref name="price-code"/>
|
35
|
+
</optional>
|
36
|
+
</define>
|
37
|
+
<define name="bdate">
|
38
|
+
<element name="date">
|
39
|
+
<attribute name="type">
|
40
|
+
<choice>
|
41
|
+
<ref name="BibliographicDateType"/>
|
42
|
+
<text/>
|
43
|
+
</choice>
|
44
|
+
</attribute>
|
45
|
+
<choice>
|
46
|
+
<group>
|
47
|
+
<element name="from">
|
48
|
+
<ref name="ISO8601Date"/>
|
49
|
+
</element>
|
50
|
+
<optional>
|
51
|
+
<element name="to">
|
52
|
+
<ref name="ISO8601Date"/>
|
53
|
+
</element>
|
54
|
+
</optional>
|
55
|
+
</group>
|
56
|
+
<element name="on">
|
57
|
+
<choice>
|
58
|
+
<ref name="ISO8601Date"/>
|
59
|
+
<value>--</value>
|
60
|
+
<value>–</value>
|
61
|
+
</choice>
|
62
|
+
</element>
|
63
|
+
</choice>
|
64
|
+
</element>
|
65
|
+
</define>
|
66
|
+
<define name="DocumentType">
|
67
|
+
<choice>
|
68
|
+
<value>international-standard</value>
|
69
|
+
<value>technical-specification</value>
|
70
|
+
<value>technical-report</value>
|
71
|
+
<value>publicly-available-specification</value>
|
72
|
+
<value>international-workshop-agreement</value>
|
73
|
+
<value>guide</value>
|
74
|
+
<value>recommendation</value>
|
75
|
+
<value>amendment</value>
|
76
|
+
<value>technical-corrigendum</value>
|
77
|
+
<value>directive</value>
|
78
|
+
<value>committee-document</value>
|
79
|
+
<value>addendum</value>
|
80
|
+
</choice>
|
81
|
+
</define>
|
82
|
+
<define name="DocumentSubtype">
|
83
|
+
<choice>
|
84
|
+
<value>specification</value>
|
85
|
+
<value>method-of-test</value>
|
86
|
+
<value>vocabulary</value>
|
87
|
+
<value>code-of-practice</value>
|
88
|
+
</choice>
|
89
|
+
</define>
|
90
|
+
<define name="structuredidentifier">
|
91
|
+
<element name="structuredidentifier">
|
92
|
+
<optional>
|
93
|
+
<attribute name="type"/>
|
94
|
+
</optional>
|
95
|
+
<group>
|
96
|
+
<ref name="documentnumber"/>
|
97
|
+
<optional>
|
98
|
+
<ref name="tc-documentnumber"/>
|
99
|
+
</optional>
|
100
|
+
</group>
|
101
|
+
</element>
|
102
|
+
</define>
|
103
|
+
<define name="editorialgroup">
|
104
|
+
<element name="editorialgroup">
|
105
|
+
<ref name="ISOProjectGroup"/>
|
106
|
+
</element>
|
107
|
+
</define>
|
108
|
+
</include>
|
109
|
+
<define name="updates_document_type">
|
110
|
+
<element name="updates-document-type">
|
111
|
+
<ref name="DocumentType"/>
|
112
|
+
</element>
|
113
|
+
</define>
|
114
|
+
<define name="ISOProjectGroup">
|
115
|
+
<zeroOrMore>
|
116
|
+
<ref name="agency"/>
|
117
|
+
</zeroOrMore>
|
118
|
+
<oneOrMore>
|
119
|
+
<ref name="technical-committee"/>
|
120
|
+
</oneOrMore>
|
121
|
+
<zeroOrMore>
|
122
|
+
<ref name="subcommittee"/>
|
123
|
+
</zeroOrMore>
|
124
|
+
<zeroOrMore>
|
125
|
+
<ref name="workgroup"/>
|
126
|
+
</zeroOrMore>
|
127
|
+
<optional>
|
128
|
+
<ref name="secretariat"/>
|
129
|
+
</optional>
|
130
|
+
</define>
|
131
|
+
<define name="approvalgroup">
|
132
|
+
<element name="approvalgroup">
|
133
|
+
<ref name="ISOProjectGroup"/>
|
134
|
+
</element>
|
135
|
+
</define>
|
136
|
+
<define name="agency">
|
137
|
+
<element name="agency">
|
138
|
+
<text/>
|
139
|
+
</element>
|
140
|
+
</define>
|
141
|
+
<define name="horizontal">
|
142
|
+
<element name="horizontal">
|
143
|
+
<data type="boolean"/>
|
144
|
+
</element>
|
145
|
+
</define>
|
146
|
+
<define name="documentnumber">
|
147
|
+
<element name="project-number">
|
148
|
+
<optional>
|
149
|
+
<attribute name="part">
|
150
|
+
<data type="int"/>
|
151
|
+
</attribute>
|
152
|
+
</optional>
|
153
|
+
<optional>
|
154
|
+
<attribute name="subpart">
|
155
|
+
<data type="int"/>
|
156
|
+
</attribute>
|
157
|
+
</optional>
|
158
|
+
<optional>
|
159
|
+
<attribute name="amendment">
|
160
|
+
<data type="int"/>
|
161
|
+
</attribute>
|
162
|
+
</optional>
|
163
|
+
<optional>
|
164
|
+
<attribute name="corrigendum">
|
165
|
+
<data type="int"/>
|
166
|
+
</attribute>
|
167
|
+
</optional>
|
168
|
+
<optional>
|
169
|
+
<attribute name="origyr">
|
170
|
+
<ref name="ISO8601Date"/>
|
171
|
+
</attribute>
|
172
|
+
</optional>
|
173
|
+
<text/>
|
174
|
+
</element>
|
175
|
+
</define>
|
176
|
+
<define name="tc-documentnumber">
|
177
|
+
<element name="tc-document-number">
|
178
|
+
<data type="int"/>
|
179
|
+
</element>
|
180
|
+
</define>
|
181
|
+
<define name="subcommittee">
|
182
|
+
<element name="subcommittee">
|
183
|
+
<ref name="IsoWorkgroup"/>
|
184
|
+
</element>
|
185
|
+
</define>
|
186
|
+
<define name="workgroup">
|
187
|
+
<element name="workgroup">
|
188
|
+
<ref name="IsoWorkgroup"/>
|
189
|
+
</element>
|
190
|
+
</define>
|
191
|
+
<define name="secretariat">
|
192
|
+
<element name="secretariat">
|
193
|
+
<text/>
|
194
|
+
</element>
|
195
|
+
</define>
|
196
|
+
<define name="stagename">
|
197
|
+
<element name="stagename">
|
198
|
+
<optional>
|
199
|
+
<attribute name="abbreviation"/>
|
200
|
+
</optional>
|
201
|
+
<text/>
|
202
|
+
</element>
|
203
|
+
</define>
|
204
|
+
<define name="fast_track">
|
205
|
+
<element name="fast-track">
|
206
|
+
<data type="boolean"/>
|
207
|
+
</element>
|
208
|
+
</define>
|
209
|
+
<define name="price-code">
|
210
|
+
<element name="price-code">
|
211
|
+
<text/>
|
212
|
+
</element>
|
213
|
+
</define>
|
214
|
+
</grammar>
|
@@ -0,0 +1,206 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# require 'relaton_iso/iso_bibliographic_item'
|
4
|
+
# require "relaton_iso/scrapper"
|
5
|
+
# require "relaton_iso/hit_collection"
|
6
|
+
# require "relaton_iec"
|
7
|
+
|
8
|
+
module Relaton
|
9
|
+
module Iso
|
10
|
+
# Methods for searching ISO standards.
|
11
|
+
module Bibliography
|
12
|
+
extend self
|
13
|
+
|
14
|
+
# Look up documents matching an identifier.
#
# @param pubid [Pubid::Iso::Identifier, String] identifier to search for;
#   a String is first parsed with Pubid::Iso::Identifier.parse
# @param opts [Hash] options handed unchanged to HitCollection.new
# @return [Object] the value returned by HitCollection#find
# @raise [Relaton::RequestError] when one of the network-level errors listed
#   in the rescue clause is raised; the original message is preserved
def search(pubid, opts = {})
  identifier = pubid.is_a?(String) ? ::Pubid::Iso::Identifier.parse(pubid) : pubid
  HitCollection.new(identifier, opts).find
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
       EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
       Net::ProtocolError, OpenSSL::SSL::SSLError, Errno::ETIMEDOUT => e
  raise Relaton::RequestError, e.message
end
|
24
|
+
|
25
|
+
# Fetch a single bibliographic item for a reference.
#
# @param ref [String] the ISO standard code to look up (e.g. "ISO 9000")
# @param year [String, NilClass] the year the standard was published
# @param opts [Hash] options
# @option opts [Boolean] :all_parts if all-parts reference is required
# @option opts [Boolean] :keep_year if undated reference should return
#   actual reference with year
#
# @return [Object, nil] the fetched item, the result of calling
#   `to_most_recent_reference` on it, or nil when nothing is found or the
#   reference cannot be parsed
def get(ref, year = nil, opts = {}) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/MethodLength,Metrics/PerceivedComplexity,Metrics/AbcSize
  # En dashes (U+2013) are replaced by ASCII hyphens before parsing.
  normalized_ref = ref.gsub("\u2013", "-")

  # parse "all parts" request
  # code.sub! " (all parts)", ""
  # opts[:all_parts] ||= $~ && opts[:all_parts].nil?

  query_pubid = ::Pubid::Iso::Identifier.parse(normalized_ref)
  query_pubid.root.year = year.to_i if year&.respond_to?(:to_i)
  query_pubid.root.all_parts ||= opts[:all_parts]
  Util.info "Fetching from Relaton repository ...", key: query_pubid.to_s

  hits, missed_year_ids = isobib_search_filter(query_pubid, opts)
  tip_ids = look_up_with_any_types_stages(hits, ref, opts)
  item = hits.fetch_doc
  return fetch_ref_err(query_pubid, missed_year_ids, tip_ids) unless item

  primary_id = item.docidentifier.find(&:primary)
  Util.info "Found: `#{primary_id}`", key: query_pubid.to_s

  # The item is returned as fetched when the query carries a year (and
  # :keep_year was not given), or when :keep_year or :all_parts is truthy.
  year_pinned = query_pubid.root.year && opts[:keep_year].nil?
  return item if year_pinned || opts[:keep_year] || opts[:all_parts]

  item.to_most_recent_reference
rescue ::Pubid::Core::Errors::ParseError
  Util.warn "Is not recognized as a standards identifier.", key: normalized_ref
  nil
end
|
60
|
+
|
61
|
+
# Compare the part of two identifiers.
#
# @param query_pubid [Pubid::Iso::Identifier] identifier from the query
# @param pubid [Pubid::Iso::Identifier] candidate identifier
# @param all_parts [Boolean] when true, any candidate that has a part matches
# @return [Boolean]
def matches_parts?(query_pubid, pubid, all_parts: false)
  if all_parts
    # Only documents that carry a part number qualify.
    !pubid.part.nil?
  else
    query_pubid.part == pubid.part
  end
end
|
71
|
+
|
72
|
+
#
# Matches base of query_pubid and pubid: publisher, number and copublisher
# must be equal; stage and class are compared too unless any_types_stages
# is set.
#
# @param [Pubid::Iso::Identifier] query_pubid pubid to match
# @param [Pubid::Iso::Identifier] pubid pubid to match
# @param [Boolean] any_types_stages match with any types and stages
#
# @return [Boolean] false when pubid does not respond to `publisher`
#
def matches_base?(query_pubid, pubid, any_types_stages: false) # rubocop:disable Metrics/PerceivedComplexity
  # Candidates without a publisher accessor (e.g. plain strings) never match.
  return false unless pubid.respond_to?(:publisher)

  query_pubid.publisher == pubid.publisher &&
    query_pubid.number == pubid.number &&
    query_pubid.copublisher == pubid.copublisher &&
    (any_types_stages || query_pubid.stage == pubid.stage) &&
    (any_types_stages || query_pubid.is_a?(pubid.class))
end
|
90
|
+
|
91
|
+
# Keep only the hits whose year matches, in place.
#
# A hit whose pubid has no year is first given the year stored in its
# `hit[:year]` entry. Hits that fail the year check but do have a year are
# recorded by their pubid string.
#
# @param hit_collection [#select!] collection of hits, filtered in place
# @param year [String, Integer, nil] year to match; nil disables filtering
# @return [Array] two elements: the same hit_collection and a Set of pubid
#   strings that were dropped because of a year mismatch
def filter_hits_by_year(hit_collection, year)
  missed_year_ids = Set.new
  return [hit_collection, missed_year_ids] if year.nil?

  hit_collection.select! do |hit|
    hit.pubid.year ||= hit.hit[:year]
    matched = check_year(year, hit)
    missed_year_ids << hit.pubid.to_s if !matched && hit.pubid.year
    matched
  end

  [hit_collection, missed_year_ids]
end
|
109
|
+
|
110
|
+
private
|
111
|
+
|
112
|
+
# Tell whether a hit matches the requested year.
#
# The hit matches when its pubid year equals `year`, or when the pubid has a
# base identifier whose year equals `year`. Years are compared as strings.
#
# @param year [#to_s] requested year
# @param hit [#pubid] hit whose pubid responds to `year` and `base`
# @return [Boolean]
def check_year(year, hit)
  wanted = year.to_s
  return true if hit.pubid.year.to_s == wanted

  !hit.pubid.base.nil? && hit.pubid.base.year.to_s == wanted
end
|
117
|
+
|
118
|
+
# Log "not found" diagnostics and hints for a reference without results.
#
# @param pubid [Pubid::Iso::Identifier] PubID with no results
# @param missed_year_ids [Enumerable] IDs that matched except for the year
# @param tip_ids [Enumerable] IDs found by the relaxed type/stage lookup
# @return [nil]
def fetch_ref_err(pubid, missed_year_ids, tip_ids) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
  key = pubid.to_s
  backtick_list = ->(ids) { ids.map { |id| "`#{id}`" }.join(", ") }

  Util.info "Not found.", key: key

  if missed_year_ids.any?
    Util.info "TIP: No match for edition year #{pubid.year}, but matches exist for " \
              "#{backtick_list.call(missed_year_ids)}.", key: key
  end

  Util.info "TIP: Matches exist for #{backtick_list.call(tip_ids)}.", key: key if tip_ids.any?

  tip = if pubid.part
          "TIP: If it cannot be found, the document may no longer be published in parts."
        else
          "TIP: If you wish to cite all document parts for the reference, " \
            "use `#{pubid.to_s(format: :ref_undated)} (all parts)`."
        end
  Util.info tip, key: key

  nil
end
|
141
|
+
|
142
|
+
# When a typed reference (e.g. "ISO/TS ...") produced no hits, retry the
# search with the type prefix removed and type/stage matching relaxed.
#
# @param hits [#any?] hits found by the strict search
# @param ref [String] reference as given by the caller
# @param opts [Hash] options forwarded to isobib_search_filter
# @return [Array] pubids of the relaxed matches; empty when hits exist or
#   the reference does not start with "ISO" followed by "/" or whitespace
#   and an uppercase letter
def look_up_with_any_types_stages(hits, ref, opts)
  return [] if hits.any?
  return [] unless ref.match?(/^ISO[\/\s][A-Z]/)

  bare_ref = ref.sub(/^ISO[\/\s][A-Z]+/, "ISO")
  bare_pubid = ::Pubid::Iso::Identifier.parse(bare_ref)
  matches, = isobib_search_filter(bare_pubid, opts, any_types_stages: true)
  matches.map(&:pubid)
end
|
150
|
+
|
151
|
+
#
# Run a search and filter the resulting hits against the query identifier.
#
# @param query_pubid [Pubid::Iso::Identifier] reference without correction
# @param opts [Hash] options forwarded to `search`
# @param any_types_stages [Boolean] match with any types and stages
#
# @return [Array] the value returned by `filter_hits` (hits and missed year IDs)
#
def isobib_search_filter(query_pubid, opts, any_types_stages: false)
  filter_hits(search(query_pubid, opts), query_pubid, any_types_stages)
end
|
166
|
+
|
167
|
+
#
# Filter hits by query_pubid, in place, then by year.
#
# @param hit_collection [#select!, #ref_pubid_no_year] hits to filter
# @param query_pubid [Pubid::Iso::Identifier]
# @param any_types_stages [Boolean]
#
# @return [Array] the value returned by `filter_hits_by_year` (hits and
#   missed year IDs)
#
def filter_hits(hit_collection, query_pubid, any_types_stages) # rubocop:disable Metrics/AbcSize
  excludings = build_excludings(query_pubid.root.all_parts, any_types_stages)
  no_year_ref = hit_collection.ref_pubid_no_year.exclude(*excludings)

  hit_collection.select! do |hit|
    next false unless pubid_match?(hit.pubid, query_pubid, excludings, no_year_ref)

    # An all-parts query only keeps hits that carry a part.
    !query_pubid.root.all_parts || !hit.pubid.part.nil?
  end

  filter_hits_by_year(hit_collection, query_pubid.root.year)
end
|
188
|
+
|
189
|
+
# Build the list of identifier attributes to ignore when comparing pubids.
#
# @param all_parts [Boolean, nil] truthy adds :part
# @param any_types_stages [Boolean, nil] truthy adds :type, :stage, :iteration
# @return [Array<Symbol>] always starts with :year, :edition, :all_parts
def build_excludings(all_parts, any_types_stages)
  fields = %i[year edition all_parts]
  fields.concat(%i[type stage iteration]) if any_types_stages
  fields.push(:part) if all_parts
  fields
end
|
195
|
+
|
196
|
+
# Compare a hit's pubid with the year-less query reference.
#
# A String pubid is compared with the query's string form. Otherwise a copy
# of the pubid is taken, year and edition are removed from its base (if it
# has one), the given attributes are excluded, and the result is compared
# with no_year_ref.
#
# @param pubid [Pubid::Iso::Identifier, String] identifier of the hit
# @param query_pubid [Pubid::Iso::Identifier] identifier from the query
# @param excludings [Array<Symbol>] attributes to exclude before comparing
# @param no_year_ref [Object] reference identifier to compare against
# @return [Boolean]
def pubid_match?(pubid, query_pubid, excludings, no_year_ref)
  return pubid == query_pubid.to_s if pubid.is_a?(String)

  candidate = pubid.dup
  candidate.base = candidate.base.exclude(:year, :edition) if candidate.base
  candidate.exclude(*excludings) == no_year_ref
end
|
204
|
+
end
|
205
|
+
end
|
206
|
+
end
|
@@ -0,0 +1,227 @@
|
|
1
|
+
require_relative "../iso"
|
2
|
+
require_relative "queue"
|
3
|
+
require_relative "scraper"
|
4
|
+
|
5
|
+
module Relaton
|
6
|
+
module Iso
|
7
|
+
# Fetch all the documents from ISO website.
|
8
|
+
class DataFetcher < Core::DataFetcher
|
9
|
+
# Repository and issue title used for this fetcher's GitHub issue.
#
# @return [Array<String>] repository name followed by the issue title
def gh_issue_channel
  repository = "relaton/relaton-iso"
  issue_title = "Error fetching ISO documents"
  [repository, issue_title]
end
|
12
|
+
|
13
|
+
#
# The queue is used to store the ICS page paths being fetched in the
# current run. It is created on first use and reused afterwards.
#
# @return [Queue] queue
#
def queue
  return @queue if @queue

  @queue = ::Queue.new
end
|
21
|
+
|
22
|
+
# Lock created on first use and reused afterwards.
#
# @return [Mutex]
def mutex
  return @mutex if @mutex

  @mutex = Mutex.new
end
|
25
|
+
|
26
|
+
# Forward an error message to Util.error.
#
# @param msg [String] message to log
# @return [Object] whatever Util.error returns
def log_error(msg)
  Util.error(msg)
end
|
29
|
+
|
30
|
+
# The :iso index, looked up or created on first use and reused afterwards.
# Its file name is HitCollection::INDEXFILE with a ".yaml" suffix.
#
# @return [Object] the value returned by Relaton::Index.find_or_create
def index
  return @index if @index

  index_file = "#{HitCollection::INDEXFILE}.yaml"
  @index = Relaton::Index.find_or_create(:iso, file: index_file)
end
|
33
|
+
|
34
|
+
#
# ISO has too many docs. GHA can't get them all in one run.
# So, we need to split the process into several runs.
# The iso_queue is used to store the doc paths that have not been fetched.
# It is created on first use and reused afterwards.
#
# @return [Relaton::Iso::Queue] queue
#
def iso_queue
  return @iso_queue if @iso_queue

  @iso_queue = Relaton::Iso::Queue.new
end
|
44
|
+
|
45
|
+
#
# Go through all ICS pages, fetch the documents, then save the ISO queue
# and the index.
#
# @return [Object] the value returned by `repot_errors`
#
def fetch # rubocop:disable Metrics/AbcSize
  Util.info("Scrapping ICS pages...")
  fetch_ics

  started_at = Time.now
  Util.info("(#{started_at}) Scrapping documents...")
  fetch_docs

  iso_queue.save
  # index.sort! { |a, b| compare_docids a, b }
  index.save

  # NOTE(review): `repot_errors` is not defined in this file; the name looks
  # like a misspelling of `report_errors` - confirm against the superclass.
  repot_errors
end
|
60
|
+
|
61
|
+
private
|
62
|
+
|
63
|
+
#
# Crawl the ICS pages: the start page is fetched on the calling thread and
# every ICS link found is handed to three worker threads through `queue`.
#
# @return [Array<Thread>] the joined worker threads
#
def fetch_ics
  workers = Array.new(3) { thread { |path| fetch_ics_page(path) } }
  fetch_ics_page "/standards-catalogue/browse-by-ics.html"

  # An empty queue alone does not mean the crawl is finished: a worker may
  # have popped the last path and still be about to push the links it finds.
  # Wait until the queue is empty AND every live worker is blocked in
  # `queue.pop`. Dead workers are not counted, so a crashed worker cannot
  # stall the loop; its exception surfaces from `join` below.
  sleep(1) until queue.empty? && queue.num_waiting >= workers.count(&:alive?)

  workers.size.times { queue << :END }
  workers.each(&:join)
end
|
75
|
+
|
76
|
+
# Fetch one ICS page and collect the document and ICS links it contains.
#
# @param path [String] path to the ICS page
# @return [Object, nil] nil when the page could not be fetched (an error is
#   logged), otherwise the value returned by `parse_ics_links`
def fetch_ics_page(path)
  response = get_redirection(path)
  if response
    page = Nokogiri::HTML(response.body)
    parse_doc_links(page)
    parse_ics_links(page)
  else
    Util.error "Failed fetching ICS page #{url(path)}"
    nil
  end
end
|
87
|
+
|
88
|
+
# Collect document links from an ICS page and add them to iso_queue.
#
# The :doc_links entry of @errors is only updated while it is still truthy;
# it then becomes whether this page had no document links.
#
# @param page [#xpath] parsed ICS page
# @return [Object] the collection of matched link nodes
def parse_doc_links(page)
  doc_links = page.xpath("//td[@data-title='Standard and/or project']/div/div/a")
  @errors[:doc_links] = doc_links.empty? if @errors[:doc_links]
  doc_links.each do |link|
    # Drop the query string from the link before queuing it.
    iso_queue.add_first(link[:href].split("?").first)
  end
end
|
93
|
+
|
94
|
+
# Collect ICS links from an ICS page and push them onto the work queue.
#
# The :ics_links entry of @errors is only updated while it is still truthy;
# it then becomes whether this page had no ICS links.
#
# @param page [#xpath] parsed ICS page
# @return [Object] the collection of matched link nodes
def parse_ics_links(page)
  ics_links = page.xpath("//td[@data-title='ICS']/a")
  @errors[:ics_links] = ics_links.empty? if @errors[:ics_links]
  ics_links.each do |link|
    queue << link[:href]
  end
end
|
99
|
+
|
100
|
+
# Build a URL by appending a path to Scraper::DOMAIN.
#
# @param path [String] path to append
# @return [String]
def url(path)
  domain = Scraper::DOMAIN
  domain + path
end
|
103
|
+
|
104
|
+
#
# Get the page from the given path, following redirects via `get_response`.
# Open/read timeouts and refused connections are retried for as long as
# `check_try` returns a truthy value; after that a warning is logged.
#
# @param [String] path path to the page
#
# @return [Object] the value returned by `get_response`, or the value
#   returned by Util.warn when retries are exhausted
#
def get_redirection(path) # rubocop:disable Metrics/MethodLength
  attempts = 0
  uri = URI(url(path))
  begin
    get_response(uri)
  rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNREFUSED => e
    attempts += 1
    retry if check_try(attempts, uri)

    Util.warn "Failed fetching #{uri}, #{e.message}"
  end
end
|
124
|
+
|
125
|
+
# Perform a GET request; a 302 response is followed through
# `get_redirection` using its "location" header.
#
# @param uri [URI] address to request
# @return [Object] the response, or the value returned by `get_redirection`
def get_response(uri)
  response = Net::HTTP.get_response(uri)
  return response unless response.code == "302"

  get_redirection(response["location"])
end
|
129
|
+
|
130
|
+
# Decide whether a failed request should be retried. Fewer than three
# attempts: log a warning, pause for one second and allow the retry.
#
# @param try [Integer] number of attempts made so far
# @param uri [Object] address being fetched (used in the log message)
# @return [true, nil] true when the caller should retry, nil otherwise
def check_try(try, uri)
  return unless try < 3

  Util.warn "Timeout fetching #{uri}, retrying..."
  sleep 1
  true
end
|
137
|
+
|
138
|
+
# Fetch documents with three worker threads. The paths come from the slice
# `iso_queue[0..10_000]`; each worker receives an :END marker afterwards and
# is joined.
#
# @return [Array<Thread>] the joined worker threads
def fetch_docs
  workers = 3.times.map { thread { |path| fetch_doc(path) } }
  batch = iso_queue[0..10_000]
  batch.each { |docpath| queue.push(docpath) }
  workers.each { queue.push(:END) }
  workers.each(&:join)
end
|
144
|
+
|
145
|
+
#
# Fetch a document page, parse it and save the result while holding `mutex`.
# Any StandardError is logged as a warning instead of being raised.
#
# @param [String] docpath document page path
#
# @return [Object] the value of `save_doc`, or of Util.warn on failure
#
def fetch_doc(docpath)
  parsed = Scraper.parse_page(docpath, errors: @errors)
  mutex.synchronize { save_doc(parsed, docpath) }
rescue StandardError => e
  Util.warn "Fail fetching document: #{url(docpath)}\n#{e.message}\n#{e.backtrace}"
end
|
158
|
+
|
159
|
+
# def compare_docids(id1, id2)
|
160
|
+
# Pubid::Iso::Identifier.create(**id1).to_s <=> Pubid::Iso::Identifier.create(**id2).to_s
|
161
|
+
# end
|
162
|
+
|
163
|
+
#
# Save a document to its output file, then move its path to the end of
# iso_queue. An existing file is only replaced according to
# `rewrite_with_same_or_newer`.
#
# @param [Object] doc document; its primary docidentifier names the file
# @param [String] docpath document page path
#
# @return [Object] the value returned by `iso_queue.move_last`
#
def save_doc(doc, docpath) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  primary_id = doc.docidentifier.detect(&:primary)
  path = output_file(primary_id.content.to_s)

  if File.exist?(path)
    rewrite_with_same_or_newer(doc, primary_id, path, docpath)
  else
    write_file(path, doc, primary_id)
  end

  iso_queue.move_last(docpath)
end
|
180
|
+
|
181
|
+
# Decide what to do when the output file already exists.
#
# The file is rewritten when the new document has a greater edition or when
# `replace_substage98?` allows it. Otherwise, if the file was already
# written in this run and the stored edition is not greater than the new
# one, a duplicate warning is logged.
#
# @param doc [Object] newly fetched document
# @param docid [Object] primary document identifier of doc
# @param file [String] path of the existing file
# @param docpath [String] document page path (used in the warning)
# @return [Object, nil] value of `write_file` or Util.warn; nil when nothing is done
def rewrite_with_same_or_newer(doc, docid, file, docpath)
  stored = Item.from_yaml(File.read(file, encoding: "UTF-8"))
  return write_file(file, doc, docid) if edition_greater?(doc, stored) || replace_substage98?(doc, stored)
  return unless @files.include?(file)
  return if edition_greater?(stored, doc)

  Util.warn "Duplicate file `#{file}` for `#{docid.content}` from #{url(docpath)}"
end
|
189
|
+
|
190
|
+
# Tell whether doc has a numerically greater edition than bib.
#
# @param doc [#edition] document whose edition responds to `content`
# @param bib [#edition] document to compare against
# @return [Boolean, nil] falsy when either edition is missing; otherwise the
#   result of comparing the edition contents converted with `to_i`
def edition_greater?(doc, bib)
  (candidate = doc.edition) &&
    (reference = bib.edition) &&
    candidate.content.to_i > reference.content.to_i
end
|
193
|
+
|
194
|
+
# Tell whether an existing file of the same edition may be replaced.
#
# True when both editions have equal content and either the new document's
# status substage is not "98" or the stored document's substage is "98".
#
# @param doc [Object] newly fetched document
# @param bib [Object] document read from the existing file
# @return [Boolean]
def replace_substage98?(doc, bib) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
  return false unless doc.edition&.content == bib.edition&.content

  incoming_substage = doc.status&.substage&.content
  return true unless incoming_substage == "98"

  bib.status&.substage&.content == "98"
end
|
198
|
+
|
199
|
+
# Record the file as written in this run, register it in the index and
# write the serialized document to disk as UTF-8.
#
# @param file [String] output file path
# @param doc [Object] document passed to `serialize`
# @param docid [Object] identifier; `content.to_h` is used as the index key
# @return [Integer] number of bytes written by File.write
def write_file(file, doc, docid)
  @files << file
  index.add_or_update(docid.content.to_h, file)
  payload = serialize(doc)
  File.write(file, payload, encoding: "UTF-8")
end
|
204
|
+
|
205
|
+
# Serialize a document through Item.to_yaml.
#
# @param doc [Object] document to serialize
# @return [Object] the value returned by Item.to_yaml
def to_yaml(doc)
  Item.to_yaml(doc)
end
|
208
|
+
|
209
|
+
# Serialize a document through Bibdata.to_xml.
#
# @param doc [Object] document to serialize
# @return [Object] the value returned by Bibdata.to_xml
def to_xml(doc)
  Bibdata.to_xml(doc)
end
|
212
|
+
|
213
|
+
#
# Create a worker thread that pops paths from `queue` and yields each one
# to the given block until it pops the :END marker.
#
# @yieldparam path [Object] item popped from the queue
#
# @return [Thread] thread
#
def thread
  Thread.new do
    until (path = queue.pop) == :END
      yield path
    end
  end
end
|
225
|
+
end
|
226
|
+
end
|
227
|
+
end
|