relaton-iso 1.20.0 → 2.0.0.pre.alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +1 -1
  3. data/Gemfile +1 -0
  4. data/README.adoc +134 -130
  5. data/bin/console +1 -1
  6. data/grammars/basicdoc.rng +2110 -0
  7. data/grammars/biblio-standoc.rng +287 -0
  8. data/grammars/biblio.rng +2097 -0
  9. data/grammars/relaton-iso-compile.rng +11 -0
  10. data/grammars/relaton-iso.rng +214 -0
  11. data/lib/relaton/iso/bibliography.rb +206 -0
  12. data/lib/relaton/iso/data_fetcher.rb +227 -0
  13. data/lib/relaton/iso/hash_parser_v1.rb +121 -0
  14. data/lib/relaton/iso/hit.rb +62 -0
  15. data/lib/relaton/iso/hit_collection.rb +117 -0
  16. data/lib/relaton/iso/item_data.rb +49 -0
  17. data/lib/relaton/iso/model/bibdata.rb +9 -0
  18. data/lib/relaton/iso/model/bibitem.rb +7 -0
  19. data/lib/relaton/iso/model/contributor.rb +7 -0
  20. data/lib/relaton/iso/model/contributor_info.rb +9 -0
  21. data/lib/relaton/iso/model/docidentifier.rb +128 -0
  22. data/lib/relaton/iso/model/doctype.rb +13 -0
  23. data/lib/relaton/iso/model/ext.rb +47 -0
  24. data/lib/relaton/iso/model/iso_project_group.rb +21 -0
  25. data/lib/relaton/iso/model/item.rb +17 -0
  26. data/lib/relaton/iso/model/item_base.rb +19 -0
  27. data/lib/relaton/iso/model/organization.rb +9 -0
  28. data/lib/relaton/iso/model/project_number.rb +22 -0
  29. data/lib/relaton/iso/model/relation.rb +9 -0
  30. data/lib/relaton/iso/model/stagename.rb +14 -0
  31. data/lib/relaton/iso/model/structured_identifier.rb +31 -0
  32. data/lib/relaton/iso/processor.rb +78 -0
  33. data/lib/relaton/iso/queue.rb +63 -0
  34. data/lib/relaton/iso/scraper.rb +591 -0
  35. data/lib/relaton/iso/util.rb +8 -0
  36. data/lib/relaton/iso/version.rb +7 -0
  37. data/lib/relaton/iso.rb +17 -0
  38. data/relaton_iso.gemspec +9 -7
  39. metadata +76 -46
  40. data/bin/bundle +0 -109
  41. data/bin/byebug +0 -27
  42. data/bin/coderay +0 -27
  43. data/bin/gdb_wrapper +0 -29
  44. data/bin/htmldiff +0 -27
  45. data/bin/httpclient +0 -29
  46. data/bin/ldiff +0 -27
  47. data/bin/nokogiri +0 -27
  48. data/bin/pry +0 -27
  49. data/bin/pubid-nist +0 -27
  50. data/bin/racc +0 -27
  51. data/bin/rackup +0 -29
  52. data/bin/rake +0 -27
  53. data/bin/rubocop +0 -27
  54. data/bin/ruby-parse +0 -27
  55. data/bin/ruby-rewrite +0 -27
  56. data/bin/safe_yaml +0 -29
  57. data/bin/thor +0 -27
  58. data/lib/relaton_iso/data_fetcher.rb +0 -246
  59. data/lib/relaton_iso/document_identifier.rb +0 -46
  60. data/lib/relaton_iso/hash_converter.rb +0 -15
  61. data/lib/relaton_iso/hit.rb +0 -59
  62. data/lib/relaton_iso/hit_collection.rb +0 -100
  63. data/lib/relaton_iso/iso_bibliography.rb +0 -202
  64. data/lib/relaton_iso/processor.rb +0 -67
  65. data/lib/relaton_iso/queue.rb +0 -61
  66. data/lib/relaton_iso/scrapper.rb +0 -553
  67. data/lib/relaton_iso/util.rb +0 -6
  68. data/lib/relaton_iso/version.rb +0 -5
  69. data/lib/relaton_iso.rb +0 -17
@@ -0,0 +1,11 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <grammar xmlns="http://relaxng.org/ns/structure/1.0">
3
+ <include href="basicdoc.rng"/>
4
+ <include href="relaton-iso.rng"/>
5
+ <start>
6
+ <choice>
7
+ <ref name="bibitem"/>
8
+ <ref name="bibdata"/>
9
+ </choice>
10
+ </start>
11
+ </grammar>
@@ -0,0 +1,214 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <grammar xmlns="http://relaxng.org/ns/structure/1.0" datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes">
3
+ <include href="biblio-standoc.rng">
4
+ <define name="BibDataExtensionType">
5
+ <optional>
6
+ <attribute name="schema-version"/>
7
+ </optional>
8
+ <ref name="doctype"/>
9
+ <optional>
10
+ <ref name="docsubtype"/>
11
+ </optional>
12
+ <ref name="flavor"/>
13
+ <optional>
14
+ <ref name="horizontal"/>
15
+ </optional>
16
+ <ref name="editorialgroup"/>
17
+ <optional>
18
+ <ref name="approvalgroup"/>
19
+ </optional>
20
+ <zeroOrMore>
21
+ <ref name="ics"/>
22
+ </zeroOrMore>
23
+ <ref name="structuredidentifier"/>
24
+ <optional>
25
+ <ref name="stagename"/>
26
+ </optional>
27
+ <optional>
28
+ <ref name="updates_document_type"/>
29
+ </optional>
30
+ <optional>
31
+ <ref name="fast_track"/>
32
+ </optional>
33
+ <optional>
34
+ <ref name="price-code"/>
35
+ </optional>
36
+ </define>
37
+ <define name="bdate">
38
+ <element name="date">
39
+ <attribute name="type">
40
+ <choice>
41
+ <ref name="BibliographicDateType"/>
42
+ <text/>
43
+ </choice>
44
+ </attribute>
45
+ <choice>
46
+ <group>
47
+ <element name="from">
48
+ <ref name="ISO8601Date"/>
49
+ </element>
50
+ <optional>
51
+ <element name="to">
52
+ <ref name="ISO8601Date"/>
53
+ </element>
54
+ </optional>
55
+ </group>
56
+ <element name="on">
57
+ <choice>
58
+ <ref name="ISO8601Date"/>
59
+ <value>--</value>
60
+ <value>–</value>
61
+ </choice>
62
+ </element>
63
+ </choice>
64
+ </element>
65
+ </define>
66
+ <define name="DocumentType">
67
+ <choice>
68
+ <value>international-standard</value>
69
+ <value>technical-specification</value>
70
+ <value>technical-report</value>
71
+ <value>publicly-available-specification</value>
72
+ <value>international-workshop-agreement</value>
73
+ <value>guide</value>
74
+ <value>recommendation</value>
75
+ <value>amendment</value>
76
+ <value>technical-corrigendum</value>
77
+ <value>directive</value>
78
+ <value>committee-document</value>
79
+ <value>addendum</value>
80
+ </choice>
81
+ </define>
82
+ <define name="DocumentSubtype">
83
+ <choice>
84
+ <value>specification</value>
85
+ <value>method-of-test</value>
86
+ <value>vocabulary</value>
87
+ <value>code-of-practice</value>
88
+ </choice>
89
+ </define>
90
+ <define name="structuredidentifier">
91
+ <element name="structuredidentifier">
92
+ <optional>
93
+ <attribute name="type"/>
94
+ </optional>
95
+ <group>
96
+ <ref name="documentnumber"/>
97
+ <optional>
98
+ <ref name="tc-documentnumber"/>
99
+ </optional>
100
+ </group>
101
+ </element>
102
+ </define>
103
+ <define name="editorialgroup">
104
+ <element name="editorialgroup">
105
+ <ref name="ISOProjectGroup"/>
106
+ </element>
107
+ </define>
108
+ </include>
109
+ <define name="updates_document_type">
110
+ <element name="updates-document-type">
111
+ <ref name="DocumentType"/>
112
+ </element>
113
+ </define>
114
+ <define name="ISOProjectGroup">
115
+ <zeroOrMore>
116
+ <ref name="agency"/>
117
+ </zeroOrMore>
118
+ <oneOrMore>
119
+ <ref name="technical-committee"/>
120
+ </oneOrMore>
121
+ <zeroOrMore>
122
+ <ref name="subcommittee"/>
123
+ </zeroOrMore>
124
+ <zeroOrMore>
125
+ <ref name="workgroup"/>
126
+ </zeroOrMore>
127
+ <optional>
128
+ <ref name="secretariat"/>
129
+ </optional>
130
+ </define>
131
+ <define name="approvalgroup">
132
+ <element name="approvalgroup">
133
+ <ref name="ISOProjectGroup"/>
134
+ </element>
135
+ </define>
136
+ <define name="agency">
137
+ <element name="agency">
138
+ <text/>
139
+ </element>
140
+ </define>
141
+ <define name="horizontal">
142
+ <element name="horizontal">
143
+ <data type="boolean"/>
144
+ </element>
145
+ </define>
146
+ <define name="documentnumber">
147
+ <element name="project-number">
148
+ <optional>
149
+ <attribute name="part">
150
+ <data type="int"/>
151
+ </attribute>
152
+ </optional>
153
+ <optional>
154
+ <attribute name="subpart">
155
+ <data type="int"/>
156
+ </attribute>
157
+ </optional>
158
+ <optional>
159
+ <attribute name="amendment">
160
+ <data type="int"/>
161
+ </attribute>
162
+ </optional>
163
+ <optional>
164
+ <attribute name="corrigendum">
165
+ <data type="int"/>
166
+ </attribute>
167
+ </optional>
168
+ <optional>
169
+ <attribute name="origyr">
170
+ <ref name="ISO8601Date"/>
171
+ </attribute>
172
+ </optional>
173
+ <text/>
174
+ </element>
175
+ </define>
176
+ <define name="tc-documentnumber">
177
+ <element name="tc-document-number">
178
+ <data type="int"/>
179
+ </element>
180
+ </define>
181
+ <define name="subcommittee">
182
+ <element name="subcommittee">
183
+ <ref name="IsoWorkgroup"/>
184
+ </element>
185
+ </define>
186
+ <define name="workgroup">
187
+ <element name="workgroup">
188
+ <ref name="IsoWorkgroup"/>
189
+ </element>
190
+ </define>
191
+ <define name="secretariat">
192
+ <element name="secretariat">
193
+ <text/>
194
+ </element>
195
+ </define>
196
+ <define name="stagename">
197
+ <element name="stagename">
198
+ <optional>
199
+ <attribute name="abbreviation"/>
200
+ </optional>
201
+ <text/>
202
+ </element>
203
+ </define>
204
+ <define name="fast_track">
205
+ <element name="fast-track">
206
+ <data type="boolean"/>
207
+ </element>
208
+ </define>
209
+ <define name="price-code">
210
+ <element name="price-code">
211
+ <text/>
212
+ </element>
213
+ </define>
214
+ </grammar>
@@ -0,0 +1,206 @@
1
+ # frozen_string_literal: true
2
+
3
+ # require 'relaton_iso/iso_bibliographic_item'
4
+ # require "relaton_iso/scrapper"
5
+ # require "relaton_iso/hit_collection"
6
+ # require "relaton_iec"
7
+
8
+ module Relaton
9
+ module Iso
10
+ # Methods for searching ISO standards.
11
+ module Bibliography
12
+ extend self
13
+
14
# Look up hits for an ISO identifier.
#
# @param pubid [Pubid::Iso::Identifier, String] identifier; a String is
#   parsed with Pubid::Iso::Identifier.parse first
# @param opts [Hash] options handed unchanged to HitCollection.new
# @return [Object] whatever HitCollection#find returns (callers in this
#   module treat it as a hit collection)
# @raise [Relaton::RequestError] when a network-level error is raised while
#   searching; the original error message is preserved
def search(pubid, opts = {})
  identifier = pubid.is_a?(String) ? ::Pubid::Iso::Identifier.parse(pubid) : pubid
  HitCollection.new(identifier, opts).find
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
       EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
       Net::ProtocolError, OpenSSL::SSL::SSLError, Errno::ETIMEDOUT => e
  raise Relaton::RequestError, e.message
end
24
+
25
# Fetch a single bibliographic item by reference.
#
# @param ref [String] the ISO standard code to look up (e.g. "ISO 9000");
#   en dashes are replaced with hyphens before parsing
# @param year [String, NilClass] the year the standard was published
# @param opts [Hash] options
# @option opts [Boolean] :all_parts if an all-parts reference is required
# @option opts [Boolean] :keep_year if an undated reference should return
#   the actual reference with year
#
# @return [Object, nil] the item returned by `hits.fetch_doc` (or its
#   `to_most_recent_reference`), or nil when nothing was found or the
#   reference could not be parsed
def get(ref, year = nil, opts = {}) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/MethodLength,Metrics/PerceivedComplexity,Metrics/AbcSize
  code = ref.gsub("\u2013", "-")

  # parse "all parts" request
  # code.sub! " (all parts)", ""
  # opts[:all_parts] ||= $~ && opts[:all_parts].nil?

  query_pubid = ::Pubid::Iso::Identifier.parse(code)
  query_pubid.root.year = year.to_i if year&.respond_to?(:to_i)
  query_pubid.root.all_parts ||= opts[:all_parts]
  Util.info "Fetching from Relaton repository ...", key: query_pubid.to_s

  hits, missed_year_ids = isobib_search_filter(query_pubid, opts)
  # NOTE(review): the raw `ref` (not the dash-normalised `code`) is passed
  # here — confirm whether that is intentional.
  tip_ids = look_up_with_any_types_stages(hits, ref, opts)
  doc = hits.fetch_doc
  return fetch_ref_err(query_pubid, missed_year_ids, tip_ids) unless doc

  primary_id = doc.docidentifier.find(&:primary) # .sub(" (all parts)", "")
  Util.info "Found: `#{primary_id}`", key: query_pubid.to_s

  # An explicit :keep_year wins; otherwise a dated query keeps its year.
  keep_year = opts[:keep_year]
  dated = keep_year.nil? ? query_pubid.root.year : keep_year
  return doc if dated || opts[:all_parts]

  doc.to_most_recent_reference
rescue ::Pubid::Core::Errors::ParseError
  Util.warn "Is not recognized as a standards identifier.", key: code
  nil
end
60
+
61
# Compare the part numbers of two identifiers.
#
# @param query_pubid [Pubid::Iso::Identifier] identifier being searched for
# @param pubid [Pubid::Iso::Identifier] candidate identifier
# @param all_parts [Boolean] when true, accept any candidate that has a part
#   number, regardless of the query's part
# @return [Boolean]
def matches_parts?(query_pubid, pubid, all_parts: false)
  if all_parts
    # match only documents that carry a part number
    !pubid.part.nil?
  else
    query_pubid.part == pubid.part
  end
end
71
+
72
#
# Matches the base of query_pubid against pubid: publisher, number and
# copublisher must be equal; stage and class must also agree unless
# any_types_stages is set.
#
# @param [Pubid::Iso::Identifier] query_pubid pubid to match
# @param [Pubid::Iso::Identifier] pubid pubid to match
# @param [Boolean] any_types_stages match with any types and stages
#
# @return [Boolean] false when pubid does not respond to `publisher`
#
def matches_base?(query_pubid, pubid, any_types_stages: false) # rubocop:disable Metrics/PerceivedComplexity
  return false unless pubid.respond_to?(:publisher)

  query_pubid.publisher == pubid.publisher &&
    query_pubid.number == pubid.number &&
    query_pubid.copublisher == pubid.copublisher &&
    (any_types_stages ||
      (query_pubid.stage == pubid.stage && query_pubid.is_a?(pubid.class)))
end
90
+
91
# Keep only the hits whose year matches, collecting the IDs of dated hits
# that were dropped. The collection is filtered in place with `select!`.
#
# @param hit_collection [#select!] hits to filter
# @param year [String, Integer, nil] requested year; nil disables filtering
# @return [Array] two elements: the hit collection and a Set of ID strings
#   for hits that have a year but did not match
def filter_hits_by_year(hit_collection, year)
  missed_year_ids = Set.new

  unless year.nil?
    hit_collection.select! do |hit|
      # back-fill a missing pubid year from the raw hit data
      hit.pubid.year ||= hit.hit[:year]
      if check_year(year, hit)
        true
      else
        missed_year_ids << hit.pubid.to_s if hit.pubid.year
        false
      end
    end
  end

  [hit_collection, missed_year_ids]
end
109
+
110
+ private
111
+
112
# Tell whether a hit matches the requested year. The hit's own year always
# counts; when the pubid has a base, the base's year counts as well.
#
# @param year [#to_s] requested year
# @param hit [#pubid] hit whose pubid responds to `year` and `base`
# @return [Boolean]
def check_year(year, hit) # rubocop:disable Metrics/AbcSize
  wanted = year.to_s
  pubid = hit.pubid
  return true if pubid.year.to_s == wanted

  base = pubid.base
  !base.nil? && base.year.to_s == wanted
end
117
+
118
# Log a "not found" message plus any applicable tips for a failed lookup.
#
# @param pubid [Pubid::Iso::Identifier] PubID with no results
# @param missed_year_ids [Enumerable<String>] IDs that matched except for year
# @param tip_ids [Enumerable] IDs to suggest as alternatives
# @return [nil]
def fetch_ref_err(pubid, missed_year_ids, tip_ids) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
  key = pubid.to_s
  quoted = ->(ids) { ids.map { |i| "`#{i}`" }.join(", ") }

  Util.info "Not found.", key: key

  if missed_year_ids.any?
    Util.info "TIP: No match for edition year #{pubid.year}, but matches exist for #{quoted.call(missed_year_ids)}.",
              key: key
  end

  Util.info "TIP: Matches exist for #{quoted.call(tip_ids)}.", key: key if tip_ids.any?

  tip = if pubid.part
          "TIP: If it cannot be found, the document may no longer be published in parts."
        else
          "TIP: If you wish to cite all document parts for the reference, " \
            "use `#{pubid.to_s(format: :ref_undated)} (all parts)`."
        end
  Util.info tip, key: key

  nil
end
141
+
142
# When nothing was found for a reference whose prefix is "ISO" followed by
# "/" or whitespace and a capital letter, retry the search with that
# capital-letter run stripped and with types/stages ignored.
#
# @param hits [#any?] hits already found for the original reference
# @param ref [String] reference as given by the caller
# @param opts [Hash] options passed on to isobib_search_filter
# @return [Array] pubids of the relaxed matches; empty when hits exist or
#   the reference does not have the prefix described above
def look_up_with_any_types_stages(hits, ref, opts)
  return [] if hits.any?
  return [] unless ref.match?(/^ISO[\/\s][A-Z]/)

  plain_ref = ref.sub(/^ISO[\/\s][A-Z]+/, "ISO")
  plain_pubid = ::Pubid::Iso::Identifier.parse(plain_ref)
  relaxed_hits, = isobib_search_filter(plain_pubid, opts, any_types_stages: true)
  relaxed_hits.map(&:pubid)
end
150
+
151
#
# Run a search for query_pubid and filter the result down to matching hits.
#
# @param query_pubid [Pubid::Iso::Identifier] reference without correction
# @param opts [Hash] options passed to `search`
# @param any_types_stages [Boolean] match with any types and stages
#
# @return [Array] the return value of `filter_hits`: hits and missed year IDs
#
def isobib_search_filter(query_pubid, opts, any_types_stages: false)
  filter_hits(search(query_pubid, opts), query_pubid, any_types_stages)
end
166
+
167
#
# Filter hits by query_pubid (in place), then by the query's year.
#
# @param hit_collection [RelatonIso::HitCollection]
# @param query_pubid [Pubid::Iso::Identifier]
# @param any_types_stages [Boolean]
#
# @return [Array] the return value of `filter_hits_by_year`: hits and
#   missed year IDs
#
def filter_hits(hit_collection, query_pubid, any_types_stages) # rubocop:disable Metrics/AbcSize
  all_parts = query_pubid.root.all_parts
  excludings = build_excludings(all_parts, any_types_stages)
  no_year_ref = hit_collection.ref_pubid_no_year.exclude(*excludings)

  hit_collection.select! do |hit|
    # for an all-parts query, hits without a part number are dropped
    pubid_match?(hit.pubid, query_pubid, excludings, no_year_ref) &&
      (!all_parts || !hit.pubid.part.nil?)
  end

  filter_hits_by_year(hit_collection, query_pubid.root.year)
end
188
+
189
# Build the list of identifier fields to ignore when comparing pubids.
#
# @param all_parts [Boolean] also ignore :part
# @param any_types_stages [Boolean] also ignore :type, :stage and :iteration
# @return [Array<Symbol>] a new array on every call
def build_excludings(all_parts, any_types_stages)
  [
    %i[year edition all_parts],
    (%i[type stage iteration] if any_types_stages),
    (%i[part] if all_parts),
  ].compact.flatten
end
195
+
196
# Decide whether a hit's pubid matches the query.
#
# A String pubid is compared with the query's string form. Otherwise a
# copy of the pubid (with :year and :edition removed from its base, if any)
# has the excluded fields removed and is compared with no_year_ref.
#
# @param pubid [String, #exclude] pubid of the hit
# @param query_pubid [#to_s] identifier being searched for
# @param excludings [Array<Symbol>] fields to ignore
# @param no_year_ref [Object] reference identifier to compare against
# @return [Boolean]
def pubid_match?(pubid, query_pubid, excludings, no_year_ref)
  return pubid == query_pubid.to_s if pubid.is_a? String

  candidate = pubid.dup # work on a copy so the hit's pubid is untouched
  candidate.base &&= candidate.base.exclude(:year, :edition)
  candidate.exclude(*excludings) == no_year_ref
end
204
+ end
205
+ end
206
+ end
@@ -0,0 +1,227 @@
1
+ require_relative "../iso"
2
+ require_relative "queue"
3
+ require_relative "scraper"
4
+
5
+ module Relaton
6
+ module Iso
7
+ # Fetch all the documents from the ISO website.
8
+ class DataFetcher < Core::DataFetcher
9
# Pair of strings: a repository slug and an error title.
# NOTE(review): presumably consumed by Core::DataFetcher when reporting
# fetch failures as GitHub issues — confirm against the parent class.
#
# @return [Array<String>]
def gh_issue_channel
  repo = "relaton/relaton-iso"
  title = "Error fetching ISO documents"
  [repo, title]
end
12
+
13
#
# The queue stores the paths being fetched in the current run and feeds
# the worker threads. It is created on first use.
#
# @return [Queue] queue
#
def queue
  return @queue if @queue

  @queue = ::Queue.new
end
21
+
22
# Lazily created lock; `fetch_doc` wraps `save_doc` in it.
#
# @return [Mutex]
def mutex
  return @mutex if @mutex

  @mutex = Mutex.new
end
25
+
26
# Forward an error message to Util.error.
#
# @param msg [String] message to log
def log_error(msg)
  Util.error(msg)
end
29
+
30
# Memoized :iso index, found or created via Relaton::Index using the file
# name "<HitCollection::INDEXFILE>.yaml".
#
# @return [Object] whatever Relaton::Index.find_or_create returns
def index
  @index ||= begin
    index_file = "#{HitCollection::INDEXFILE}.yaml"
    Relaton::Index.find_or_create(:iso, file: index_file)
  end
end
33
+
34
#
# ISO has too many docs for GHA to fetch in a single run, so the process
# is split into several runs. The iso_queue stores the doc paths that have
# not been fetched yet. It is created on first use.
#
# @return [Relaton::Iso::Queue] queue
#
def iso_queue
  return @iso_queue if @iso_queue

  @iso_queue = Relaton::Iso::Queue.new
end
44
+
45
#
# Go through all ICS pages, fetch the documents, then persist the
# iso_queue and the index and report collected errors.
#
# @return [void]
#
def fetch # rubocop:disable Metrics/AbcSize
  Util.info("Scrapping ICS pages...")
  fetch_ics

  Util.info("(#{Time.now}) Scrapping documents...")
  fetch_docs

  iso_queue.save
  # index.sort! { |a, b| compare_docids a, b }
  index.save
  repot_errors
end
60
+
61
+ private
62
+
63
#
# Fetch the ICS root page on the calling thread while three worker threads
# fetch the ICS pages that get pushed to `queue`. Once the queue is empty,
# one :END marker per worker is pushed and the workers are joined.
#
# NOTE(review): the emptiness check does not account for a worker that is
# still processing a page and may enqueue more links — confirm this is
# acceptable.
#
def fetch_ics
  workers = 3.times.map { thread { |path| fetch_ics_page(path) } }
  fetch_ics_page "/standards-catalogue/browse-by-ics.html"
  sleep(1) until queue.empty?
  workers.each { queue << :END }
  workers.each(&:join)
end
75
+
76
# Download one ICS page and harvest its document links and ICS sub-links.
# Logs an error and returns nil when the page could not be fetched.
#
# @param path [String] path to the ICS page
def fetch_ics_page(path)
  response = get_redirection path
  if response
    html = Nokogiri::HTML(response.body)
    parse_doc_links html
    parse_ics_links html
  else
    Util.error "Failed fetching ICS page #{url(path)}"
    nil
  end
end
87
+
88
# Add every document link found on the page to the front of iso_queue,
# with the query string removed. `@errors[:doc_links]` is updated only
# while it is still truthy, and becomes false once a page yields links.
#
# @param page [#xpath] parsed ICS page
def parse_doc_links(page)
  anchors = page.xpath "//td[@data-title='Standard and/or project']/div/div/a"
  @errors[:doc_links] = anchors.empty? if @errors[:doc_links]
  anchors.each do |anchor|
    iso_queue.add_first anchor[:href].split("?").first
  end
end
93
+
94
# Push every ICS sub-page link found on the page onto `queue`.
# `@errors[:ics_links]` is updated only while it is still truthy, and
# becomes false once a page yields links.
#
# @param page [#xpath] parsed ICS page
def parse_ics_links(page)
  anchors = page.xpath("//td[@data-title='ICS']/a")
  @errors[:ics_links] = anchors.empty? if @errors[:ics_links]
  anchors.each do |anchor|
    queue << anchor[:href]
  end
end
99
+
100
# Build an address by prefixing the path with Scraper::DOMAIN.
#
# @param path [String] site-relative path
# @return [String]
def url(path)
  domain = Scraper::DOMAIN
  domain + path
end
103
+
104
#
# Get the page from the given path. If the page is redirected, get the
# page from the new path (see `get_response`). Timeouts and refused
# connections are retried for as long as `check_try` allows.
#
# @param [String] path path to the page
#
# @return [Object, nil] the value returned by `get_response`, or nil when
#   all attempts failed
#
def get_redirection(path) # rubocop:disable Metrics/MethodLength
  try = 0
  uri = URI url(path)
  begin
    get_response uri
  rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNREFUSED => e
    try += 1
    retry if check_try try, uri

    Util.warn "Failed fetching #{uri}, #{e.message}"
    # Return nil explicitly: callers test the result for truthiness, so the
    # logger's return value must not leak out as a "response".
    nil
  end
end
124
+
125
# Perform one GET request; a 302 answer is followed via `get_redirection`
# using the Location header.
# NOTE(review): the Location value is passed to `get_redirection`, which
# prefixes it with the site domain — presumably it is always a
# site-relative path; confirm.
#
# @param uri [URI] address to request
# @return [Object] the response, or the result of following the redirect
def get_response(uri)
  response = Net::HTTP.get_response(uri)
  return response unless response.code == "302"

  get_redirection(response["location"])
end
129
+
130
# Decide whether another attempt should be made. For fewer than three
# attempts it logs a warning, pauses for one second and returns true.
#
# @param try [Integer] number of attempts made so far
# @param uri [#to_s] address being fetched (used in the log message)
# @return [true, nil] nil once three attempts have been made
def check_try(try, uri)
  return unless try < 3

  Util.warn "Timeout fetching #{uri}, retrying..."
  sleep 1
  true
end
137
+
138
# Fetch documents with three worker threads. The slice `iso_queue[0..10_000]`
# is pushed to `queue`, followed by one :END marker per worker; then the
# workers are joined.
def fetch_docs
  workers = 3.times.map { thread { |path| fetch_doc(path) } }
  batch = iso_queue[0..10_000]
  batch.each { |docpath| queue << docpath }
  workers.each { queue << :END }
  workers.each(&:join)
end
144
+
145
#
# Fetch one document from the ISO website and save it under the mutex.
# Any StandardError is logged as a warning instead of being raised.
#
# @param [String] docpath document page path
#
# @return [void]
#
def fetch_doc(docpath)
  parsed = Scraper.parse_page(docpath, errors: @errors)
  mutex.synchronize do
    save_doc(parsed, docpath)
  end
rescue StandardError => e
  Util.warn "Fail fetching document: #{url(docpath)}\n#{e.message}\n#{e.backtrace}"
end
158
+
159
+ # def compare_docids(id1, id2)
160
+ # Pubid::Iso::Identifier.create(**id1).to_s <=> Pubid::Iso::Identifier.create(**id2).to_s
161
+ # end
162
+
163
#
# Save a document to a file named after its primary docidentifier. When
# the file already exists, `rewrite_with_same_or_newer` decides whether to
# overwrite it. Afterwards docpath is moved to the end of iso_queue.
#
# @param doc [Object] parsed document
# @param docpath [String] document page path
#
# @return [void]
#
def save_doc(doc, docpath) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  docid = doc.docidentifier.find(&:primary)
  file = output_file(docid.content.to_s)

  if File.exist?(file)
    rewrite_with_same_or_newer(doc, docid, file, docpath)
  else
    write_file(file, doc, docid)
  end

  iso_queue.move_last(docpath)
end
180
+
181
# Overwrite an existing file when the new document has a greater edition
# or `replace_substage98?` approves; otherwise warn about a duplicate if
# the file was already written in this run and the stored edition is not
# greater than the new one.
#
# @param doc [Object] newly parsed document
# @param docid [Object] primary docidentifier of doc
# @param file [String] path of the existing file
# @param docpath [String] document page path (used in the warning)
def rewrite_with_same_or_newer(doc, docid, file, docpath)
  stored = Item.from_yaml(File.read(file, encoding: "UTF-8"))
  return write_file(file, doc, docid) if edition_greater?(doc, stored) || replace_substage98?(doc, stored)
  return unless @files.include?(file) && !edition_greater?(stored, doc)

  Util.warn "Duplicate file `#{file}` for `#{docid.content}` from #{url(docpath)}"
end
189
+
190
# Compare editions numerically (via `content.to_i`).
#
# @param doc [#edition] document whose edition should be greater
# @param bib [#edition] document to compare against
# @return [Boolean, nil] falsy when either edition is missing
def edition_greater?(doc, bib)
  newer = doc.edition
  return newer unless newer

  older = bib.edition
  return older unless older

  newer.content.to_i > older.content.to_i
end
193
+
194
# True when both documents have the same edition content and it is not the
# case that only the new document has status substage "98".
#
# @param doc [#edition, #status] newly parsed document
# @param bib [#edition, #status] stored document
# @return [Boolean]
def replace_substage98?(doc, bib) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
  return false unless doc.edition&.content == bib.edition&.content

  doc_is98 = doc.status&.substage&.content == "98"
  bib_is98 = bib.status&.substage&.content == "98"
  !doc_is98 || bib_is98
end
198
+
199
# Record the file as written, register the identifier in the index and
# write the serialized document to disk as UTF-8.
#
# @param file [String] destination path
# @param doc [Object] document passed to `serialize`
# @param docid [Object] identifier whose `content.to_h` is indexed
def write_file(file, doc, docid)
  @files << file
  index.add_or_update(docid.content.to_h, file)
  File.write(file, serialize(doc), encoding: "UTF-8")
end
204
+
205
# Serialize a document with Item.to_yaml.
#
# @param doc [Object] document to serialize
# @return [Object] result of Item.to_yaml
def to_yaml(doc)
  Item.to_yaml(doc)
end
208
+
209
# Serialize a document with Bibdata.to_xml.
#
# @param doc [Object] document to serialize
# @return [Object] result of Bibdata.to_xml
def to_xml(doc)
  Bibdata.to_xml(doc)
end
212
+
213
#
# Create a worker thread that pops paths from `queue` and yields each one
# to the given block until it pops the :END marker.
#
# @yieldparam path [Object] item popped from the queue
# @return [Thread] thread
#
def thread
  Thread.new do
    until (path = queue.pop) == :END
      yield path
    end
  end
end
225
+ end
226
+ end
227
+ end