relaton-iso 1.18.1 → 1.18.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/relaton_iso/data_fetcher.rb +200 -0
- data/lib/relaton_iso/document_identifier.rb +20 -1
- data/lib/relaton_iso/hash_converter.rb +15 -0
- data/lib/relaton_iso/hit.rb +29 -21
- data/lib/relaton_iso/hit_collection.rb +74 -59
- data/lib/relaton_iso/index.rb +132 -0
- data/lib/relaton_iso/iso_bibliography.rb +172 -180
- data/lib/relaton_iso/processor.rb +22 -2
- data/lib/relaton_iso/queue.rb +61 -0
- data/lib/relaton_iso/scrapper.rb +118 -70
- data/lib/relaton_iso/version.rb +1 -1
- data/lib/relaton_iso.rb +5 -0
- data/relaton_iso.gemspec +1 -0
- metadata +20 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 395d9fbaf99042f03785dd5b529d1a1821b8e5939a378503b6af4898901363d5
|
4
|
+
data.tar.gz: 4745ddb2a95d4b5dfc2290afdc2116c0ed9f113cda3c717909ca9987eec33486
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fb0e270a99fc7a4a8cd07bf0f28d7c5d157976ffaf436523dd9ff871950726af552447f4e4eea1802eb0a59856702fd9ca95757b1b7fea3a89f69e5f83d3f876
|
7
|
+
data.tar.gz: 35d93ed7fdead4846059a22485ff56652b727640d754a2e4523350706966107418c3fb723eb278abaf17460f80612394dc61bf9720907294d821cacbc84e6b7f
|
@@ -0,0 +1,200 @@
|
|
1
|
+
module RelatonIso
  # Fetch all the documents from ISO website.
  class DataFetcher
    #
    # Initialize data fetcher.
    #
    # @param [String] output output directory
    # @param [String] format format of output files (yaml, bibxml, xml)
    #
    def initialize(output, format)
      @output = output
      @format = format
      @ext = format.sub(/^bib/, "") # "bibxml" output is written with the "xml" extension
      @files = []                   # paths already written during this run
      @queue = ::Queue.new          # work queue consumed by the worker threads
      @mutex = Mutex.new            # serializes save_doc calls across workers
    end

    #
    # Index of the fetched documents, created on first use.
    #
    # @return [Object] whatever Relaton::Index.find_or_create returns
    #
    def index
      @index ||= Relaton::Index.find_or_create :iso, file: HitCollection::INDEXFILE
    end

    #
    # Queue of document page paths, created on first use.
    #
    # @return [RelatonIso::Queue]
    #
    def iso_queue
      @iso_queue ||= RelatonIso::Queue.new
    end

    #
    # Initialize data fetcher and fetch data.
    #
    # @param [String] output output directory (default: "data")
    # @param [String] format format of output files. Allowed: yaml (default), bibxml, xml
    #
    # @return [void]
    #
    def self.fetch(output: "data", format: "yaml")
      t1 = Time.now
      puts "Started at: #{t1}"
      FileUtils.mkdir_p output
      new(output, format).fetch
      t2 = Time.now
      puts "Stopped at: #{t2}"
      puts "Done in: #{(t2 - t1).round} sec."
    end

    #
    # Go through all ICS and fetch all documents.
    #
    # @return [void]
    #
    def fetch # rubocop:disable Metrics/AbcSize
      puts "Scrapping ICS pages..."
      fetch_ics
      puts "[#{Time.now}] Scrapping documents..."
      fetch_docs
      iso_queue.save
      # index.sort! { |a, b| compare_docids a, b }
      index.save
    end

    #
    # Crawl the ICS pages, starting from the catalogue root, with three worker
    # threads. Links to documents found on the way are stored in the iso_queue.
    #
    # @return [void]
    #
    def fetch_ics
      threads = Array.new(3) { thread { |path| fetch_ics_page(path) } }
      fetch_ics_page "/standards-catalogue/browse-by-ics.html"
      # An empty queue alone does not mean the crawl is over: a worker may
      # still be parsing a page that is about to enqueue more ICS links.
      sleep(0.1) until ics_crawl_finished?(threads)
      threads.size.times { @queue << :END }
      threads.each(&:join)
    end

    #
    # Fetch one ICS page, add the document links it lists to the iso_queue and
    # push the links to nested ICS pages onto the work queue.
    #
    # @param [String] path path to ICS page
    #
    # @return [void]
    #
    def fetch_ics_page(path)
      resp = get_redirection path
      # get_redirection returns nil once its retries are exhausted (it has
      # already reported the failure); there is no page to parse in that case.
      return unless resp

      page = Nokogiri::HTML(resp.body)
      page.xpath("//td[@data-title='Standard and/or project']/div/div/a").each do |item|
        iso_queue.add_first item[:href].split("?").first
      end

      page.xpath("//td[@data-title='ICS']/a").each do |item|
        @queue << item[:href]
      end
    end

    #
    # Get the page from the given path. If the page is redirected, get the
    # page from the new path. Timeouts are retried (see #check_try).
    #
    # @param [String] path path to the page
    #
    # @return [Net::HTTPResponse, nil] HTTP response, or nil when every
    #   attempt timed out
    #
    def get_redirection(path) # rubocop:disable Metrics/MethodLength
      try = 0
      uri = URI(Scrapper::DOMAIN + path)
      begin
        get_response uri
      rescue Net::OpenTimeout, Net::ReadTimeout => e
        try += 1
        retry if check_try try, uri

        warn "Error fetching #{uri}"
        warn e.message
        nil
      end
    end

    #
    # Request the URI and follow a "302" response through #get_redirection.
    #
    # @param [URI] uri address to request
    #
    # @return [Net::HTTPResponse, nil] response, or the result of following
    #   the redirection
    #
    def get_response(uri)
      resp = Net::HTTP.get_response(uri)
      resp.code == "302" ? get_redirection(resp["location"]) : resp
    end

    #
    # Decide whether a timed out request should be retried. When it should,
    # a warning is printed and the call sleeps for one second.
    #
    # @param [Integer] try number of failed attempts so far
    # @param [URI] uri address being fetched (used in the warning only)
    #
    # @return [true, nil] true while fewer than 3 attempts have failed
    #
    def check_try(try, uri)
      if try < 3
        warn "Timeout fetching #{uri}, retrying..."
        sleep 1
        true
      end
    end

    #
    # Scrape the documents listed in iso_queue[0..10_000] with three worker
    # threads.
    #
    # @return [void]
    #
    def fetch_docs
      threads = Array.new(3) { thread { |path| fetch_doc(path) } }
      iso_queue[0..10_000].each { |docpath| @queue << docpath }
      threads.size.times { @queue << :END }
      threads.each(&:join)
    end

    #
    # Fetch document from ISO website. Errors are reported with `warn` and
    # do not stop the worker.
    #
    # @param [String] docpath document page path
    #
    # @return [void]
    #
    def fetch_doc(docpath)
      # path = docpath.sub(/\.html$/, "")
      # hit = Hit.new({ path: docpath }, nil)
      doc = Scrapper.parse_page docpath
      @mutex.synchronize { save_doc doc, docpath }
    rescue StandardError => e
      warn "Error fetching document: #{Scrapper::DOMAIN}#{docpath}"
      warn e.message
      warn e.backtrace
    end

    # def compare_docids(id1, id2)
    #   Pubid::Iso::Identifier.create(**id1).to_s <=> Pubid::Iso::Identifier.create(**id2).to_s
    # end

    #
    # save document to file.
    #
    # The file name is derived from the primary document identifier. A second
    # document that maps to an already written file is reported and skipped.
    #
    # @param [RelatonIsoBib::IsoBibliographicItem] doc document
    # @param [String] docpath document page path
    #
    # @return [void]
    #
    def save_doc(doc, docpath) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
      docid = doc.docidentifier.detect(&:primary)
      file_name = docid.id.gsub(/[\s\/:]+/, "-").downcase
      file = File.join @output, "#{file_name}.#{@ext}"
      if @files.include? file
        warn "Duplicate file #{file} for #{docid.id} from #{Scrapper::DOMAIN}#{docpath}"
      else
        @files << file
        index.add_or_update docid.to_h, file
        File.write file, serialize(doc), encoding: "UTF-8"
      end
      iso_queue.move_last docpath
    end

    #
    # Serialize document to string.
    #
    # @param [RelatonIsoBib::IsoBibliographicItem] doc document
    #
    # @return [String, nil] serialized document; nil when @format is none of
    #   "yaml", "bibxml", "xml"
    #
    def serialize(doc)
      case @format
      when "yaml" then doc.to_hash.to_yaml
      when "bibxml" then doc.to_bibxml
      when "xml" then doc.to_xml bibdata: true
      end
    end

    private

    #
    # Tell whether the ICS crawl has nothing left to do: the work queue is
    # empty and every worker that is still alive is blocked waiting on it.
    # Also true when no worker is alive, so a crashed pool cannot make the
    # caller wait forever.
    #
    # @param [Array<Thread>] threads worker threads
    #
    # @return [Boolean]
    #
    def ics_crawl_finished?(threads)
      alive = threads.count(&:alive?)
      alive.zero? || (@queue.num_waiting >= alive && @queue.empty?)
    end

    #
    # Create thread worker
    #
    # The worker pops paths from the work queue and yields each of them until
    # it pops the :END marker.
    #
    # @return [Thread] thread
    #
    def thread
      Thread.new do
        while (path = @queue.pop) != :END
          yield path
        end
      end
    end
  end
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module RelatonIso
|
2
2
|
class DocumentIdentifier < RelatonBib::DocumentIdentifier
|
3
|
-
def id
|
3
|
+
def id # rubocop:disable Metrics/MethodLength
|
4
4
|
id_str = @id.to_s.sub(/\sED\d+/, "").squeeze(" ").sub(/^ISO\/\s/, "ISO ") # workarounds for pubid gem bugs
|
5
5
|
if @all_parts
|
6
6
|
if type == "URN"
|
@@ -10,6 +10,12 @@ module RelatonIso
|
|
10
10
|
end
|
11
11
|
end
|
12
12
|
type == "URN" ? @id.urn.to_s : id_str
|
13
|
+
rescue Pubid::Iso::Errors::NoEditionError => e
|
14
|
+
Util.warn "WARNING: #{type} identifier can't be generated for #{@id}: #{e.message}"
|
15
|
+
end
|
16
|
+
|
17
|
+
# Hash form of the identifier with stringified values; nil when @id does not
# respond to #to_h.
def to_h
  return unless @id.respond_to?(:to_h)

  stringify_values(@id.to_h)
end
|
14
20
|
|
15
21
|
def remove_part
|
@@ -23,5 +29,18 @@ module RelatonIso
|
|
23
29
|
def all_parts
|
24
30
|
@all_parts = true
|
25
31
|
end
|
32
|
+
|
33
|
+
# Stringify every value of the hash (see #stringify) and drop the entries
# whose stringified value is empty.
def stringify_values(hash)
  hash.each_with_object({}) do |(key, value), result|
    converted = stringify(value)
    result[key] = converted unless converted.empty?
  end
end
|
36
|
+
|
37
|
+
# Convert a value to its string form: symbols are kept as they are, hashes
# are converted value by value, array items are converted one by one and
# everything else goes through #to_s.
def stringify(val)
  return val if val.is_a?(Symbol)
  return stringify_values(val) if val.is_a?(Hash)
  return val.to_s unless val.is_a?(Array)

  val.map { |item| item.is_a?(Hash) ? stringify_values(item) : item.to_s }
end
|
26
45
|
end
|
27
46
|
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module RelatonIso
  # Hash converter that creates RelatonIso::DocumentIdentifier objects.
  module HashConverter
    include RelatonIsoBib::HashConverter
    extend self

    # Create a document identifier from the given attributes. A String :id of
    # a primary identifier is parsed into a Pubid::Iso::Identifier first; when
    # parsing fails a warning is logged and the String is kept.
    #
    # @param args [Hash] attributes passed on to DocumentIdentifier.new
    # @return [RelatonIso::DocumentIdentifier]
    def create_docid(**args)
      if args[:id].is_a?(String) && args[:primary]
        begin
          args[:id] = Pubid::Iso::Identifier.parse(args[:id])
        rescue StandardError
          Util.warn "Unable to create a Pubid::Iso::Identifier from `#{args[:id]}`"
        end
      end
      DocumentIdentifier.new(**args)
    end
  end
end
|
data/lib/relaton_iso/hit.rb
CHANGED
@@ -4,28 +4,29 @@ module RelatonIso
|
|
4
4
|
# Hit.
|
5
5
|
class Hit < RelatonBib::Hit
|
6
6
|
# @return [RelatonIsoBib::IsoBibliographicItem]
|
7
|
-
attr_writer :fetch
|
7
|
+
attr_writer :fetch
|
8
|
+
|
9
|
+
# @return [Pubid::Iso::Identifier] pubid
|
10
|
+
attr_writer :pubid
|
8
11
|
|
9
12
|
# Update edition for pubid when provided in Bibliographic Item
|
10
|
-
def update_edition(bibliographic_item)
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
else
|
16
|
-
pubid.edition = bibliographic_item.edition.content
|
17
|
-
end
|
18
|
-
end
|
19
|
-
end
|
13
|
+
# def update_edition(bibliographic_item)
|
14
|
+
# if bibliographic_item.edition
|
15
|
+
# pubid.root.edition = bibliographic_item.edition.content
|
16
|
+
# end
|
17
|
+
# end
|
20
18
|
|
21
19
|
# Parse page.
|
22
20
|
# @param lang [String, nil]
|
23
21
|
# @return [RelatonIso::IsoBibliographicItem]
|
24
|
-
def fetch(
|
25
|
-
@fetch ||=
|
26
|
-
|
27
|
-
|
28
|
-
|
22
|
+
def fetch(_lang = nil)
  @fetch ||= begin
    url = "#{HitCollection::ENDPOINT}#{hit[:file]}"
    resp = Net::HTTP.get_response URI(url)
    if resp.code == "200"
      hash = YAML.safe_load resp.body
      hash["fetched"] = Date.today.to_s
      RelatonIsoBib::IsoBibliographicItem.from_hash hash
    else
      # An error page is not a YAML document: report the failure instead of
      # trying to parse it. The nil result is not memoized, so a later call
      # requests the file again.
      Util.warn "Unable to fetch #{url} (HTTP status #{resp.code})"
      nil
    end
  end
end
|
30
31
|
|
31
32
|
# @return [Integer]
|
@@ -41,11 +42,18 @@ module RelatonIso
|
|
41
42
|
|
42
43
|
# @return [Pubid::Iso::Identifier]
|
43
44
|
def pubid
|
44
|
-
@pubid
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
Util.warn
|
45
|
+
return @pubid if defined? @pubid
|
46
|
+
|
47
|
+
create_pubid hit[:id]
|
48
|
+
rescue StandardError
|
49
|
+
Util.warn "Unable to create an identifier from #{hit[:id]}"
|
50
|
+
@pubid = nil
|
51
|
+
end
|
52
|
+
|
53
|
+
private
|
54
|
+
|
55
|
+
# Store the identifier in @pubid. A Hash is turned into an identifier with
# Pubid::Iso::Identifier.create; any other value is stored as it is.
def create_pubid(id)
  @pubid =
    if id.is_a?(Hash)
      Pubid::Iso::Identifier.create(**id)
    else
      id
    end
end
|
50
58
|
end
|
51
59
|
end
|
@@ -6,82 +6,97 @@ require "relaton_iso/hit"
|
|
6
6
|
module RelatonIso
|
7
7
|
# Page of hit collection.
|
8
8
|
class HitCollection < RelatonBib::HitCollection
|
9
|
-
|
10
|
-
|
9
|
+
INDEXFILE = "index-v1.yaml"
|
10
|
+
ENDPOINT = "https://raw.githubusercontent.com/relaton/relaton-data-iso/main/"
|
11
11
|
|
12
|
-
# @param text [
|
13
|
-
def initialize(
|
12
|
+
# @param pubid [Pubid::Iso::Identifier] reference to search
|
13
|
+
def initialize(pubid, opts = {})
|
14
14
|
super
|
15
|
-
@
|
15
|
+
@opts = opts
|
16
16
|
end
|
17
17
|
|
18
|
-
|
19
|
-
|
18
|
+
# @return [Pubid::Iso::Identifier]
|
19
|
+
alias ref_pubid text
|
20
|
+
|
21
|
+
# Copy of the reference identifier whose base (when it has one) is replaced
# with a base that excludes the year. Memoized.
def ref_pubid_no_year
  return @ref_pubid_no_year if @ref_pubid_no_year

  copy = ref_pubid.dup
  copy.base = copy.base.exclude(:year) if copy.base
  @ref_pubid_no_year = copy
end
|
24
|
+
|
25
|
+
# Year-less reference identifier with the parts listed by #excludings
# excluded. Memoized.
def ref_pubid_excluded
  return @ref_pubid_excluded if @ref_pubid_excluded

  @ref_pubid_excluded = ref_pubid_no_year.exclude(*excludings)
end
|
28
|
+
|
29
|
+
# Search the index for rows matching the reference and store them in @array
# as hits, sorted by their pubid string in descending order.
def fetch # rubocop:disable Metrics/AbcSize
  rows = index.search do |row|
    if row[:id].is_a?(Hash)
      pubid_match?(row[:id])
    else
      ref_pubid.to_s == row[:id]
    end
  end
  hits = rows.map { |row| Hit.new(row, self) }
  @array = hits.sort_by { |h| h.pubid.to_s }.reverse
  self
end
|
22
37
|
|
23
|
-
|
38
|
+
# Compare an index identifier with the reference. Year and edition are
# dropped from the candidate's base, the parts from #excludings are excluded
# (plus the edition unless the candidate's typed stage is "DIR"), and the
# result is compared with #ref_pubid_excluded.
#
# @param id [Hash] identifier attributes from the index
# @return [Boolean] false when no identifier can be created from id
def pubid_match?(id)
  candidate = create_pubid(id)
  return false unless candidate

  candidate.base = candidate.base.exclude(:year, :edition) if candidate.base
  ignored = [*excludings]
  ignored.push(:edition) if candidate.typed_stage_abbrev != "DIR"
  candidate.exclude(*ignored) == ref_pubid_excluded
end
|
47
|
+
|
48
|
+
# Create an identifier from index attributes. A failure is logged together
# with the reference and nil is returned.
#
# @param id [Hash] identifier attributes
# @return [Pubid::Iso::Identifier, nil]
def create_pubid(id)
  Pubid::Iso::Identifier.create(**id)
rescue StandardError => error
  Util.warn("(#{ref_pubid}) WARNING: #{error.message}")
  nil
end
|
54
|
+
|
55
|
+
# Identifier parts that are ignored when index entries are compared with the
# reference: always the year; the part when the reference has none or all
# parts are requested; the stage and iteration when the reference has no
# stage or all parts are requested. The list is memoized in @excludings.
#
# @return [Array<Symbol>]
def excludings
  return @excludings if defined? @excludings

  excl_parts = %i[year]
  excl_parts << :part if ref_pubid.root.part.nil? || @opts[:all_parts]
  if ref_pubid.stage.nil? || @opts[:all_parts]
    excl_parts << :stage
    excl_parts << :iteration
  end
  # excl_parts << :edition if ref_pubid.root.edition.nil? || all_parts
  # Store under the same name the guard above checks, otherwise the list is
  # rebuilt on every call.
  @excludings = excl_parts
end
|
67
|
+
|
68
|
+
# Index object for ISO documents, obtained from Relaton::Index on first use
# and memoized.
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:iso, url: "#{ENDPOINT}index-v1.zip", file: INDEXFILE)
end
|
71
|
+
|
72
|
+
# Fetch the document for this collection: the all-parts item when all parts
# are requested and the collection does not hold exactly one hit, otherwise
# the first hit's document (false when the collection is empty).
def fetch_doc
  return to_all_parts(@opts[:lang]) if @opts[:all_parts] && size != 1

  any? && first.fetch(@opts[:lang])
end
|
79
|
+
|
80
|
+
# @param lang [String, nil]
|
24
81
|
# @return [RelatonIsoBib::IsoBibliographicItem, nil]
|
25
|
-
def to_all_parts(lang = nil) # rubocop:disable Metrics/AbcSize
|
26
|
-
# parts = @array.reject { |h| h.hit["docPart"]&.empty? }
|
82
|
+
def to_all_parts(lang = nil) # rubocop:disable Metrics/AbcSize
|
27
83
|
hit = @array.min_by { |h| h.pubid.part.to_i }
|
28
84
|
return @array.first&.fetch lang unless hit
|
29
85
|
|
30
86
|
bibitem = hit.fetch(lang)
|
31
87
|
all_parts_item = bibitem.to_all_parts
|
32
|
-
@array.reject { |h| h.
|
33
|
-
|
34
|
-
formattedref: RelatonBib::FormattedRef.new(content: hi.pubid.to_s),
|
35
|
-
)
|
36
|
-
all_parts_item.relation << RelatonBib::DocumentRelation.new(
|
37
|
-
type: "instanceOf", bibitem: isobib,
|
38
|
-
)
|
88
|
+
@array.reject { |h| h.pubid.part == hit.pubid.part }.each do |hi|
|
89
|
+
all_parts_item.relation << create_relation(hi)
|
39
90
|
end
|
40
91
|
all_parts_item
|
41
92
|
end
|
42
93
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
#
|
50
|
-
def fetch_github # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
51
|
-
ref = text.gsub(/[\s\/]/, "_").upcase
|
52
|
-
url = "https://raw.githubusercontent.com/relaton/relaton-data-iso/main/data/#{ref}.yaml"
|
53
|
-
resp = Net::HTTP.get_response URI(url)
|
54
|
-
return [] unless resp.code == "200"
|
55
|
-
|
56
|
-
hash = YAML.safe_load resp.body
|
57
|
-
bib_hash = RelatonIsoBib::HashConverter.hash_to_bib hash
|
58
|
-
bib_hash[:fetched] = Date.today.to_s
|
59
|
-
bib = RelatonIsoBib::IsoBibliographicItem.new(**bib_hash)
|
60
|
-
hit = Hit.new({ title: text }, self)
|
61
|
-
hit.fetch = bib
|
62
|
-
[hit]
|
63
|
-
end
|
64
|
-
|
65
|
-
#
|
66
|
-
# Fetch hits from iso.org
|
67
|
-
#
|
68
|
-
# @return [Array<RelatonIso::Hit>]
|
69
|
-
#
|
70
|
-
def fetch_iso # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
71
|
-
config = Algolia::Search::Config.new(application_id: "JCL49WV5AR", api_key: "dd1b9e1ab383f4d4817d29cd5e96d3f0")
|
72
|
-
client = Algolia::Search::Client.new config, logger: RelatonIso.configuration.logger
|
73
|
-
index = client.init_index "all_en"
|
74
|
-
resp = index.search text, hitsPerPage: 100, filters: "category:standard"
|
75
|
-
|
76
|
-
resp[:hits].map { |h| Hit.new h, self }.sort! do |a, b|
|
77
|
-
if a.sort_weight == b.sort_weight && b.hit[:year] = a.hit[:year]
|
78
|
-
a.hit[:title] <=> b.hit[:title]
|
79
|
-
elsif a.sort_weight == b.sort_weight
|
80
|
-
b.hit[:year] - a.hit[:year]
|
81
|
-
else
|
82
|
-
a.sort_weight - b.sort_weight
|
83
|
-
end
|
84
|
-
end
|
94
|
+
# Build an "instanceOf" relation to a minimal bibliographic item that
# carries only the hit's identifier and a formatted reference.
#
# @param hit [RelatonIso::Hit]
# @return [RelatonBib::DocumentRelation]
def create_relation(hit)
  identifier = DocumentIdentifier.new(id: hit.pubid, type: "ISO", primary: true)
  reference = RelatonBib::FormattedRef.new(content: hit.pubid.to_s)
  item = RelatonIsoBib::IsoBibliographicItem.new(formattedref: reference, docid: [identifier])
  RelatonBib::DocumentRelation.new(type: "instanceOf", bibitem: item)
end
|
86
101
|
end
|
87
102
|
end
|
@@ -0,0 +1,132 @@
|
|
1
|
+
module RelatonIso
  # Index.
  class Index
    #
    # Initialise index. If file path is given, read index from file. If file is not
    # given, look for `index.yml` in the user's home directory. If that file
    # doesn't exist, or is outdated then fetch index from GitHub.
    #
    # @param [String, nil] file path to index file.
    #
    def initialize(file = nil)
      @file = file
    end

    #
    # Create index.
    #
    # @return [Array<Hash>] index
    #
    def index
      @index ||= read_index || read_from_user_dir || fetch_index
    end

    #
    # Add or update index entry.
    #
    # The entry is looked up by the item's primary document identifier; its
    # title is set to the content of the item's first title.
    #
    # @param [RelatonIsoBib::IsoBibliographicItem] item document
    #
    # @return [void]
    #
    def <<(item)
      id = item.docidentifier.detect(&:primary).id
      row = self[id] || begin
        r = { id: id }
        index << r
        r
      end
      row[:title] = item.title.first.title.content
    end

    #
    # Fetch document from index by ID.
    #
    # @param [String] id document ID
    #
    # @return [Hash, nil] index entry, nil when there is none with this ID
    #
    def [](id)
      index.detect { |i| i[:id] == id }
    end

    #
    # Save index to file.
    #
    # @return [void]
    #
    def save
      serialize_and_save index
    end

    private

    #
    # Serialize index and save to file.
    #
    # The index is written as a YAML sequence, one row at a time, so that it
    # can be read back as an Array of Hashes by #read_file.
    #
    # @param [Array<Hash>] idx index
    #
    # @return [void]
    #
    def serialize_and_save(idx)
      File.open(@file, "w:UTF-8") do |f|
        # a bare "---" document would be read back as nil, not as an empty list
        f.puts(idx.empty? ? "--- []" : "---")
        idx.each do |i|
          # wrapping the row in an array makes it a "- key: value" list item
          f.puts [i.transform_keys(&:to_s)].to_yaml.sub("---\n", "")
        end
      end
    end

    #
    # Read index from file. If file doesn't exist, create empty index.
    #
    # @return [Array<Hash>, nil] index, nil when no file path was given
    #
    def read_index
      if @file && File.exist?(@file) then read_file
      elsif @file then []
      end
    end

    #
    # Read index from `index.yml` in the user's home directory, unless the
    # file doesn't exist or is outdated. Sets @file to that path, so a later
    # save writes there.
    #
    # NOTE(review): earlier comments referred to a `/home/USER/.relaton/iso`
    # directory; the code uses the home directory itself - confirm which
    # location is intended.
    #
    # @return [Array<Hash>, nil] index, nil when the file is missing or outdated
    #
    def read_from_user_dir
      @file = File.join(Dir.home, "index.yml")
      read_file if File.exist?(@file) && !outdated?
    end

    #
    # Read and parse the index file at @file.
    #
    # @return [Object] parsed YAML with symbolized keys
    #
    def read_file
      yaml = File.read @file, encoding: "UTF-8"
      RelatonBib.parse_yaml yaml, [], symbolize_names: true
    end

    #
    # Check if index file is outdated.
    #
    # @return [Boolean] true if older than 24 hours
    #
    def outdated?
      (Time.now - File.mtime(@file)) / 3600 > 24
    end

    #
    # Fetch index from GitHub.
    #
    # The downloaded index is saved to @file. An HTTP error is reported with
    # `warn` and yields an empty index.
    #
    # @return [Array<Hash>] index
    #
    def fetch_index
      url = "https://raw.githubusercontent.com/relaton/relaton-data-iso/master/iso/index.zip"
      zip = Zip::InputStream.new URI(url).open
      yaml = zip.get_next_entry.get_input_stream.read
      idx = RelatonBib.parse_yaml yaml, [], symbolize_names: true
      serialize_and_save idx
      idx
    rescue OpenURI::HTTPError => e
      warn "[relaton-iso] WARNING: failed to fetch index: #{e.message}"
      []
    end
  end
end
|