relaton-iso 1.18.1 → 1.18.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/relaton_iso/data_fetcher.rb +200 -0
- data/lib/relaton_iso/document_identifier.rb +20 -1
- data/lib/relaton_iso/hash_converter.rb +15 -0
- data/lib/relaton_iso/hit.rb +29 -21
- data/lib/relaton_iso/hit_collection.rb +74 -60
- data/lib/relaton_iso/index.rb +132 -0
- data/lib/relaton_iso/iso_bibliography.rb +171 -180
- data/lib/relaton_iso/processor.rb +22 -2
- data/lib/relaton_iso/queue.rb +61 -0
- data/lib/relaton_iso/scrapper.rb +118 -70
- data/lib/relaton_iso/version.rb +1 -1
- data/lib/relaton_iso.rb +5 -0
- data/relaton_iso.gemspec +1 -1
- metadata +13 -9
@@ -6,205 +6,196 @@ require "relaton_iso/hit_collection"
|
|
6
6
|
# require "relaton_iec"
|
7
7
|
|
8
8
|
module RelatonIso
|
9
|
-
#
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
9
|
+
# Methods for searching ISO standards.
|
10
|
+
module IsoBibliography
|
11
|
+
extend self
|
12
|
+
|
13
|
+
# Search for documents matching an identifier.
#
# @param pubid [Pubid::Iso::Identifier, String] identifier to look up; a
#   String is first parsed with Pubid::Iso::Identifier.parse
# @param opts [Hash] options passed unchanged to HitCollection.new
# @return [RelatonIso::HitCollection]
# @raise [RelatonBib::RequestError] when one of the network-level errors
#   listed in the rescue clause below is raised
def search(pubid, opts = {})
  identifier = pubid.is_a?(String) ? Pubid::Iso::Identifier.parse(pubid) : pubid
  HitCollection.new(identifier, opts).fetch
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
       EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
       Net::ProtocolError, OpenSSL::SSL::SSLError, Errno::ETIMEDOUT => e
  # Re-raise every transport failure as the single error type callers handle.
  raise RelatonBib::RequestError, e.message
end
|
22
23
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
Util.warn "(#{query_pubid}) Found: `#{response_pubid}`"
|
58
|
-
|
59
|
-
get_all = (
|
60
|
-
(query_pubid.year && opts[:keep_year].nil?) ||
|
61
|
-
opts[:keep_year] ||
|
62
|
-
opts[:all_parts]
|
63
|
-
)
|
64
|
-
return ret if get_all
|
65
|
-
|
66
|
-
ret.to_most_recent_reference
|
67
|
-
rescue Pubid::Core::Errors::ParseError
|
68
|
-
Util.warn "(#{code}) Is not recognized as a standards identifier."
|
69
|
-
nil
|
70
|
-
end
|
24
|
+
# Fetch a bibliographic item by its reference.
#
# @param ref [String] the ISO standard code to look up (e.g. "ISO 9000");
#   a " (all parts)" marker in the reference requests an all-parts lookup
# @param year [String, NilClass] the year the standard was published
# @param opts [Hash] options; note that :all_parts is written back into
#   this hash when the caller left it unset
# @option opts [Boolean] :all_parts if all-parts reference is required
# @option opts [Boolean] :keep_year if undated reference should return
#   actual reference with year
#
# @return [RelatonIsoBib::IsoBibliographicItem, nil] bibliographic item, or
#   nil when nothing is found or the reference cannot be parsed
def get(ref, year = nil, opts = {}) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/MethodLength,Metrics/PerceivedComplexity,Metrics/AbcSize
  # Normalize en dashes to plain hyphens before parsing.
  code = ref.gsub("\u2013", "-")

  # parse "all parts" request: sub! returns nil when the marker is absent,
  # so its result tells us directly whether the marker was present.
  all_parts_marker = !code.sub!(" (all parts)", "").nil?
  # An explicit :all_parts value (true or false) from the caller wins.
  opts[:all_parts] = all_parts_marker if opts[:all_parts].nil?

  query_pubid = Pubid::Iso::Identifier.parse(code)
  query_pubid.root.year = year.to_i if year&.respond_to?(:to_i)
  Util.warn "(#{query_pubid}) Fetching from Relaton repository ..."

  hits, missed_year_ids = isobib_search_filter(query_pubid, opts)
  tip_ids = look_up_with_any_types_stages(hits, ref, opts)
  ret = hits.fetch_doc
  return fetch_ref_err(query_pubid, missed_year_ids, tip_ids) unless ret

  response_pubid = ret.docidentifier.first.id # .sub(" (all parts)", "")
  Util.warn "(#{query_pubid}) Found: `#{response_pubid}`"
  # Return the dated item as-is when a year was requested (and :keep_year
  # was not given), or when :keep_year / :all_parts is set.
  get_all = (query_pubid.root.year && opts[:keep_year].nil?) || opts[:keep_year] || opts[:all_parts]
  return ret if get_all

  ret.to_most_recent_reference
rescue Pubid::Core::Errors::ParseError
  Util.warn "(#{code}) Is not recognized as a standards identifier."
  nil
end
|
71
58
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
59
|
+
# Compare the part component of two identifiers.
#
# @param query_pubid [Pubid::Iso::Identifier] identifier from the query
# @param pubid [Pubid::Iso::Identifier] candidate identifier
# @param all_parts [Boolean] match with any parts when true
# @return [Boolean]
def matches_parts?(query_pubid, pubid, all_parts: false)
  candidate_part = pubid.part
  if all_parts
    # all-parts lookup: accept only candidates that carry a part number
    !candidate_part.nil?
  else
    query_pubid.part == candidate_part
  end
end
|
82
69
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
70
|
+
#
# Matches base of query_pubid and pubid.
#
# @param [Pubid::Iso::Identifier] query_pubid pubid to match
# @param [Pubid::Iso::Identifier] pubid pubid to match
# @param [Boolean] any_types_stages match with any types and stages
#
# @return [Boolean] true when publisher, number and copublisher are equal
#   and, unless any_types_stages is set, the stage is equal and query_pubid
#   is an instance of pubid's class; false when pubid does not respond to
#   #publisher
#
def matches_base?(query_pubid, pubid, any_types_stages: false) # rubocop:disable Metrics/PerceivedComplexity
  # Objects without a #publisher reader can never match.
  return false unless pubid.respond_to?(:publisher)

  query_pubid.publisher == pubid.publisher &&
    query_pubid.number == pubid.number &&
    query_pubid.copublisher == pubid.copublisher &&
    (any_types_stages || query_pubid.stage == pubid.stage) &&
    (any_types_stages || query_pubid.is_a?(pubid.class))
end
|
88
|
+
|
89
|
+
# Keep only the hits whose year matches the requested one.
#
# @param hit_collection [RelatonIso::HitCollection]
# @param year [String, nil] requested year; nil disables filtering
# @return [Array] two elements: the selected hits, and a Set of the
#   identifiers (as strings) of dated hits rejected because of their year
def filter_hits_by_year(hit_collection, year)
  rejected_ids = Set.new
  return [hit_collection, rejected_ids] if year.nil?

  selected = hit_collection.select do |hit|
    # Fall back to the year stored in the hit data when the pubid has none.
    hit.pubid.year ||= hit.hit[:year]
    if check_year(year, hit)
      true
    else
      rejected_ids << hit.pubid.to_s if hit.pubid.year
      false
    end
  end

  [selected, rejected_ids]
end
|
107
|
+
|
108
|
+
private
|
108
109
|
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
110
|
+
# Tell whether a hit belongs to the given year.
#
# Without a base identifier only the hit's own year is compared; with a
# base identifier either the base year or the hit's own year may match.
# Years are compared as strings.
#
# @param year [String, Integer]
# @param hit [RelatonIso::Hit]
# @return [Boolean]
def check_year(year, hit)
  wanted = year.to_s
  pubid = hit.pubid
  base = pubid.base
  return pubid.year.to_s == wanted if base.nil?

  base.year.to_s == wanted || pubid.year.to_s == wanted
end
|
113
115
|
|
114
|
-
|
115
|
-
|
116
|
-
|
116
|
+
# Log a "not found" warning followed by hints, and return nil.
#
# @param pubid [Pubid::Iso::Identifier] PubID with no results
# @param missed_year_ids [Enumerable<String>] identifiers that matched
#   except for the year
# @param tip_ids [Enumerable] identifiers to suggest as alternatives
# @return [nil]
def fetch_ref_err(pubid, missed_year_ids, tip_ids) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
  quote_all = ->(list) { list.map { |i| "`#{i}`" }.join(", ") }

  Util.warn "(#{pubid}) Not found."

  if missed_year_ids.any?
    ids = quote_all.call(missed_year_ids)
    Util.warn "(#{pubid}) TIP: No match for edition year #{pubid.year}, but matches exist for #{ids}."
  end

  if tip_ids.any?
    ids = quote_all.call(tip_ids)
    Util.warn "(#{pubid}) TIP: Matches exist for #{ids}."
  end

  # The final hint depends on whether the request named a specific part.
  hint =
    if pubid.part
      "(#{pubid}) TIP: If it cannot be found, the document may no longer be published in parts."
    else
      "(#{pubid}) TIP: If you wish to cite all document parts for the reference, " \
        "use `#{pubid.to_s(format: :ref_undated)} (all parts)`."
    end
  Util.warn hint

  nil
end
|
139
|
+
|
140
|
+
# When the regular search found nothing and the reference starts with
# "ISO" followed by "/" or whitespace and capital letters (e.g. "ISO/TS"),
# search again with that prefix reduced to "ISO", ignoring types and stages.
#
# @param hits [#any?] hits found by the regular search
# @param ref [String] reference as given by the caller
# @param opts [Hash] options passed on to isobib_search_filter
# @return [Array] pubids of the hits found by the relaxed search; empty
#   when the relaxed search is not attempted
def look_up_with_any_types_stages(hits, ref, opts)
  # \A anchors at the start of the string; ^ would also match after a newline.
  typed_prefix = %r{\AISO[/\s][A-Z]+}
  return [] if hits.any? || !ref.match?(typed_prefix)

  ref_no_type_stage = ref.sub(typed_prefix, "ISO")
  pubid = Pubid::Iso::Identifier.parse(ref_no_type_stage)
  resp, = isobib_search_filter(pubid, opts, any_types_stages: true)
  resp.map(&:pubid)
end
|
165
148
|
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
149
|
+
#
# Run a search and keep only the hits matching the query.
#
# @param query_pubid [Pubid::Iso::Identifier] reference without correction
# @param opts [Hash] search options; :all_parts is also used for filtering
# @param any_types_stages [Boolean] match with any types and stages
#
# @return [Array] the two-element result of filter_hits: hits and missed
#   year IDs
#
def isobib_search_filter(query_pubid, opts, any_types_stages: false)
  found = search(query_pubid, opts)
  filter_hits(found, query_pubid, opts[:all_parts], any_types_stages)
end
|
164
|
+
|
165
|
+
#
# Filter hits by query_pubid, then by year.
#
# @param hit_collection [RelatonIso::HitCollection]
# @param query_pubid [Pubid::Iso::Identifier]
# @param all_parts [Boolean] when truthy, hits without a part are dropped
# @param any_types_stages [Boolean]
#
# @return [Array] the two-element result of filter_hits_by_year: hits and
#   missed year IDs
#
def filter_hits(hit_collection, query_pubid, all_parts, any_types_stages)
  skipped_fields = build_excludings(all_parts, any_types_stages)
  reference = hit_collection.ref_pubid_no_year.exclude(*skipped_fields)

  matched = hit_collection.select do |hit|
    next false unless pubid_match?(hit.pubid, query_pubid, skipped_fields, reference)

    # For an all-parts request, keep only hits that carry a part.
    !all_parts || !hit.pubid.part.nil?
  end

  filter_hits_by_year(matched, query_pubid.year)
end
|
185
|
+
|
186
|
+
# Build the list of identifier fields to leave out when comparing pubids.
#
# @param all_parts [Boolean] also leave out :part when truthy
# @param any_types_stages [Boolean] also leave out :type, :stage and
#   :iteration when truthy
# @return [Array<Symbol>]
def build_excludings(all_parts, any_types_stages)
  fields = %i[year edition]
  fields.concat(%i[type stage iteration]) if any_types_stages
  fields.push(:part) if all_parts
  fields
end
|
192
|
+
|
193
|
+
# Tell whether a hit's pubid matches the query.
#
# @param pubid [Pubid::Iso::Identifier, String] hit identifier; a String
#   is compared with query_pubid.to_s
# @param query_pubid [Pubid::Iso::Identifier]
# @param excludings [Array<Symbol>] fields left out of the comparison
# @param no_year_ref [Object] reference value the reduced pubid is compared to
# @return [Boolean]
def pubid_match?(pubid, query_pubid, excludings, no_year_ref)
  return pubid == query_pubid.to_s if pubid.is_a?(String)

  # Work on a copy so the hit's own pubid keeps its base untouched.
  candidate = pubid.dup
  base = candidate.base
  candidate.base = base.exclude(:year, :edition) if base
  candidate.exclude(*excludings) == no_year_ref
end
|
210
201
|
end
|
@@ -9,16 +9,29 @@ module RelatonIso
|
|
9
9
|
@prefix = "ISO"
|
10
10
|
@defaultprefix = %r{^ISO(/IEC)?\s}
|
11
11
|
@idtype = "ISO"
|
12
|
+
@datasets = %w[iso-ics]
|
12
13
|
end
|
13
14
|
|
14
15
|
# Look up a bibliographic item by delegating to IsoBibliography.get.
#
# @param code [String]
# @param date [String, nil] year
# @param opts [Hash]
# @return [RelatonIsoBib::IsoBibliographicItem]
def get(code, date, opts)
  bibliography = ::RelatonIso::IsoBibliography
  bibliography.get(code, date, opts)
end
|
21
22
|
|
23
|
+
#
# Fetch all the documents from https://www.iso.org/standards-catalogue/browse-by-ics.html
#
# @param [String] _source source name; not used by this method
#   (NOTE(review): the dataset declared in @datasets is iso-ics — confirm)
# @param [Hash] opts options forwarded to DataFetcher.fetch as keywords
# @option opts [String] :output directory to output documents
# @option opts [String] :format output format (xml, yaml, bibxml)
#
def fetch_data(_source, opts)
  fetch_options = opts
  DataFetcher.fetch(**fetch_options)
end
|
34
|
+
|
22
35
|
# @param xml [String]
|
23
36
|
# @return [RelatonIsoBib::IsoBibliographicItem]
|
24
37
|
def from_xml(xml)
|
@@ -28,7 +41,7 @@ module RelatonIso
|
|
28
41
|
# Build a bibliographic item from a hash.
#
# @param hash [Hash] converted with HashConverter.hash_to_bib before use
# @return [RelatonIsoBib::IsoBibliographicItem]
def hash_to_bib(hash)
  attributes = HashConverter.hash_to_bib(hash)
  ::RelatonIsoBib::IsoBibliographicItem.new(**attributes)
end
|
34
47
|
|
@@ -43,5 +56,12 @@ module RelatonIso
|
|
43
56
|
# @return [Integer] always 3
#   (NOTE(review): presumably the number of fetch threads — confirm against the caller)
def threads
  3
end
|
59
|
+
|
60
|
+
#
# Remove index file
#
# Looks up (or creates) the :iso index for HitCollection::INDEXFILE and
# calls remove_file on it.
#
def remove_index_file
  index = Relaton::Index.find_or_create(:iso, url: true, file: HitCollection::INDEXFILE)
  index.remove_file
end
|
46
66
|
end
|
47
67
|
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# Forwardable is used below; require it here so this file does not depend
# on another file having loaded it first.
require "forwardable"

module RelatonIso
  #
  # Queue of links to fetch. Items are kept in memory and stored in FILE,
  # one item per line, when #save is called.
  #
  class Queue
    extend Forwardable
    def_delegator :queue, :[]

    # Path of the queue file, relative to the current working directory.
    FILE = "iso-queue.txt".freeze

    #
    # Open queue file if exist. If not, create new empty queue.
    # The result is memoized, so the file is read at most once per instance.
    #
    # @return [Array<String>] queue
    #
    def queue
      @queue ||= File.exist?(FILE) ? File.read(FILE).split("\n") : []
    end

    #
    # Add item to queue at first position if it is not already there.
    #
    # @param [String] item item to add
    #
    # @return [void]
    #
    def add_first(item)
      queue.unshift item unless queue.include? item
    end

    #
    # Move or add item to the end of the queue.
    #
    # @param [String] item item to move or add
    #
    # @return [void]
    #
    def move_last(item)
      # Delete first so an existing item is moved rather than duplicated.
      queue.delete item
      queue << item
    end

    #
    # Take first item from the queue.
    #
    # @return [String] an item
    #
    # def take_first
    #   queue.shift
    # end

    #
    # Save queue to file.
    #
    # @return [void]
    #
    def save
      File.write FILE, queue.join("\n")
    end
  end
end
|