relaton-iso 1.18.1 → 1.18.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/relaton_iso/data_fetcher.rb +200 -0
- data/lib/relaton_iso/document_identifier.rb +20 -1
- data/lib/relaton_iso/hash_converter.rb +15 -0
- data/lib/relaton_iso/hit.rb +29 -21
- data/lib/relaton_iso/hit_collection.rb +74 -59
- data/lib/relaton_iso/index.rb +132 -0
- data/lib/relaton_iso/iso_bibliography.rb +172 -180
- data/lib/relaton_iso/processor.rb +22 -2
- data/lib/relaton_iso/queue.rb +61 -0
- data/lib/relaton_iso/scrapper.rb +118 -70
- data/lib/relaton_iso/version.rb +1 -1
- data/lib/relaton_iso.rb +5 -0
- data/relaton_iso.gemspec +1 -0
- metadata +20 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 395d9fbaf99042f03785dd5b529d1a1821b8e5939a378503b6af4898901363d5
|
4
|
+
data.tar.gz: 4745ddb2a95d4b5dfc2290afdc2116c0ed9f113cda3c717909ca9987eec33486
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fb0e270a99fc7a4a8cd07bf0f28d7c5d157976ffaf436523dd9ff871950726af552447f4e4eea1802eb0a59856702fd9ca95757b1b7fea3a89f69e5f83d3f876
|
7
|
+
data.tar.gz: 35d93ed7fdead4846059a22485ff56652b727640d754a2e4523350706966107418c3fb723eb278abaf17460f80612394dc61bf9720907294d821cacbc84e6b7f
|
@@ -0,0 +1,200 @@
|
|
1
|
+
module RelatonIso
  #
  # Fetch all the documents from ISO website.
  #
  # The work is done in two phases, each served by a small pool of worker
  # threads that consume paths from the in-memory `@queue`:
  #
  # 1. ICS pages are walked recursively and every link to a document page is
  #    stored in the persistent `iso_queue`;
  # 2. document pages taken from `iso_queue` are parsed, written to the
  #    output directory and registered in the index.
  #
  class DataFetcher
    # Number of worker threads started for each phase.
    WORKERS = 3

    #
    # Initialize data fetcher.
    #
    # @param [String] output output directory
    # @param [String] format format of output files (yaml, bibxml, xml)
    #
    def initialize(output, format)
      @output = output
      @format = format
      @ext = format.sub(/^bib/, "") # "bibxml" files get the "xml" extension
      @files = []                   # files written during this run (duplicate detection)
      @queue = ::Queue.new          # work queue shared with the worker threads
      @mutex = Mutex.new            # serializes #save_doc calls made by the workers
    end

    #
    # Index of the fetched documents (created lazily).
    #
    # @return [Object] whatever `Relaton::Index.find_or_create` returns
    #
    def index
      @index ||= Relaton::Index.find_or_create :iso, file: HitCollection::INDEXFILE
    end

    #
    # Queue of document page paths (created lazily).
    #
    # @return [RelatonIso::Queue]
    #
    def iso_queue
      @iso_queue ||= RelatonIso::Queue.new
    end

    #
    # Initialize data fetcher and fetch data.
    #
    # @param [String] output output directory (default: "data")
    # @param [String] format format of output files. Allowed: yaml (default), bibxml, xml
    #
    # @return [void]
    #
    def self.fetch(output: "data", format: "yaml")
      t1 = Time.now
      puts "Started at: #{t1}"
      FileUtils.mkdir_p output
      new(output, format).fetch
      t2 = Time.now
      puts "Stopped at: #{t2}"
      puts "Done in: #{(t2 - t1).round} sec."
    end

    #
    # Go through all ICS and fetch all documents.
    #
    # @return [void]
    #
    def fetch # rubocop:disable Metrics/AbcSize
      puts "Scrapping ICS pages..."
      fetch_ics
      puts "[#{Time.now}] Scrapping documents..."
      fetch_docs
      iso_queue.save
      # index.sort! { |a, b| compare_docids a, b }
      index.save
    end

    #
    # Walk the ICS pages, starting from the catalogue root, and store all the
    # links to documents in the iso_queue.
    #
    # @return [void]
    #
    def fetch_ics
      threads = Array.new(WORKERS) { thread { |path| fetch_ics_page(path) } }
      fetch_ics_page "/standards-catalogue/browse-by-ics.html"
      # A worker that is still parsing a page may push more paths, so an empty
      # queue alone doesn't mean the crawl is over. Wait until the queue is
      # empty AND every live worker is blocked in `pop`; only then nothing
      # can be enqueued anymore and it is safe to send the stop markers.
      sleep(1) until @queue.empty? && @queue.num_waiting >= threads.count(&:alive?)
      threads.size.times { @queue << :END }
      threads.each(&:join)
    end

    #
    # Fetch one ICS page. Links to documents go to the iso_queue, links to
    # nested ICS pages go to the work queue to be fetched by the workers.
    #
    # @param [String] path path to ICS page
    #
    # @return [void]
    #
    def fetch_ics_page(path)
      resp = get_redirection path
      return unless resp # the page couldn't be fetched; already reported

      page = Nokogiri::HTML(resp.body)
      page.xpath("//td[@data-title='Standard and/or project']/div/div/a").each do |item|
        iso_queue.add_first item[:href].split("?").first # drop the query string
      end

      page.xpath("//td[@data-title='ICS']/a").each do |item|
        @queue << item[:href]
      end
    end

    #
    # Get the page from the given path. If the page is redirected, get the
    # page from the new path. Timeouts are retried (see #check_try).
    #
    # @param [String] path path to the page
    #
    # @return [Net::HTTPResponse, nil] HTTP response, or nil when every
    #   attempt timed out
    #
    def get_redirection(path) # rubocop:disable Metrics/MethodLength
      try = 0
      uri = URI(Scrapper::DOMAIN + path)
      begin
        get_response uri
      rescue Net::OpenTimeout, Net::ReadTimeout => e
        try += 1
        retry if check_try try, uri

        warn "Error fetching #{uri}"
        warn e.message
        nil
      end
    end

    #
    # Request the URI and follow a 302 redirection if there is one.
    #
    # @param [URI] uri URI of the page
    #
    # @return [Net::HTTPResponse, nil] HTTP response
    #
    def get_response(uri)
      resp = Net::HTTP.get_response(uri)
      resp.code == "302" ? get_redirection(resp["location"]) : resp
    end

    #
    # Decide whether a timed out request should be retried. When it should,
    # a warning is printed and the thread sleeps for a second.
    #
    # @param [Integer] try number of failed attempts so far
    # @param [URI, String] uri requested URI (used in the warning only)
    #
    # @return [true, nil] true when the request should be retried
    #
    def check_try(try, uri)
      return unless try < 3

      warn "Timeout fetching #{uri}, retrying..."
      sleep 1
      true
    end

    #
    # Fetch the documents listed in the iso_queue (the slice 0..10_000).
    #
    # @return [void]
    #
    def fetch_docs
      threads = Array.new(WORKERS) { thread { |path| fetch_doc(path) } }
      iso_queue[0..10_000].each { |docpath| @queue << docpath }
      threads.size.times { @queue << :END }
      threads.each(&:join)
    end

    #
    # Fetch document from ISO website. Errors are reported with `warn` and
    # don't interrupt the run.
    #
    # @param [String] docpath document page path
    #
    # @return [void]
    #
    def fetch_doc(docpath)
      # path = docpath.sub(/\.html$/, "")
      # hit = Hit.new({ path: docpath }, nil)
      doc = Scrapper.parse_page docpath
      @mutex.synchronize { save_doc doc, docpath }
    rescue StandardError => e
      warn "Error fetching document: #{Scrapper::DOMAIN}#{docpath}"
      warn e.message
      warn e.backtrace
    end

    # def compare_docids(id1, id2)
    #   Pubid::Iso::Identifier.create(**id1).to_s <=> Pubid::Iso::Identifier.create(**id2).to_s
    # end

    #
    # save document to file.
    #
    # The file name is derived from the primary document identifier. A file
    # is written only once per run; further documents mapping to the same
    # file are reported as duplicates. In both cases the path is moved to the
    # end of the iso_queue.
    #
    # @param [RelatonIsoBib::IsoBibliographicItem] doc document
    # @param [String] docpath document page path
    #
    # @return [void]
    #
    def save_doc(doc, docpath) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
      docid = doc.docidentifier.detect(&:primary)
      file_name = docid.id.gsub(/[\s\/:]+/, "-").downcase
      file = File.join @output, "#{file_name}.#{@ext}"
      if @files.include? file
        warn "Duplicate file #{file} for #{docid.id} from #{Scrapper::DOMAIN}#{docpath}"
      else
        @files << file
        index.add_or_update docid.to_h, file
        File.write file, serialize(doc), encoding: "UTF-8"
      end
      iso_queue.move_last docpath
    end

    #
    # Serialize document to string.
    #
    # @param [RelatonIsoBib::IsoBibliographicItem] doc document
    #
    # @return [String, nil] serialized document; nil when the format given
    #   to the constructor is none of yaml, bibxml, xml
    #
    def serialize(doc)
      case @format
      when "yaml" then doc.to_hash.to_yaml
      when "bibxml" then doc.to_bibxml
      when "xml" then doc.to_xml bibdata: true
      end
    end

    private

    #
    # Create thread worker. The worker yields every path popped from the work
    # queue to the given block until it pops the :END marker.
    #
    # An error raised by the block is reported and the worker goes on with
    # the next path, so one bad page can't shrink the pool.
    #
    # @yieldparam [String] path path popped from the queue
    #
    # @return [Thread] thread
    #
    def thread
      Thread.new do
        while (path = @queue.pop) != :END
          begin
            yield path
          rescue StandardError => e
            warn "Error processing #{path}: #{e.message}"
          end
        end
      end
    end
  end
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module RelatonIso
|
2
2
|
class DocumentIdentifier < RelatonBib::DocumentIdentifier
|
3
|
-
def id
|
3
|
+
def id # rubocop:disable Metrics/MethodLength
|
4
4
|
id_str = @id.to_s.sub(/\sED\d+/, "").squeeze(" ").sub(/^ISO\/\s/, "ISO ") # workarounds for pubid gem bugs
|
5
5
|
if @all_parts
|
6
6
|
if type == "URN"
|
@@ -10,6 +10,12 @@ module RelatonIso
|
|
10
10
|
end
|
11
11
|
end
|
12
12
|
type == "URN" ? @id.urn.to_s : id_str
|
13
|
+
rescue Pubid::Iso::Errors::NoEditionError => e
|
14
|
+
Util.warn "WARNING: #{type} identifier can't be generated for #{@id}: #{e.message}"
|
15
|
+
end
|
16
|
+
|
17
|
+
#
# Represent the identifier as a hash with stringified values.
#
# @return [Hash, nil] result of #stringify_values for `@id.to_h`, or nil
#   when `@id` doesn't respond to `to_h`
#
def to_h
  return unless @id.respond_to?(:to_h)

  stringify_values(@id.to_h)
end
|
14
20
|
|
15
21
|
def remove_part
|
@@ -23,5 +29,18 @@ module RelatonIso
|
|
23
29
|
#
# Switch the identifier to the "all parts" mode by setting the `@all_parts`
# flag.
#
# @return [true]
#
def all_parts
  @all_parts = true
end
|
32
|
+
|
33
|
+
#
# Convert every value of the hash with #stringify and drop the entries
# whose converted value is empty.
#
# @param [Hash] hash hash to convert
#
# @return [Hash] converted hash
#
def stringify_values(hash)
  hash.each_with_object({}) do |(key, value), result|
    converted = stringify(value)
    result[key] = converted unless converted.empty?
  end
end

#
# Convert a single value:
# - arrays are mapped element by element (hashes recursively, anything
#   else with `to_s`);
# - hashes are converted recursively;
# - symbols are kept as they are;
# - anything else is converted with `to_s`.
#
# @param [Object] val value to convert
#
# @return [Array, Hash, Symbol, String] converted value
#
def stringify(val)
  if val.is_a?(Array)
    val.map { |item| item.is_a?(Hash) ? stringify_values(item) : item.to_s }
  elsif val.is_a?(Hash)
    stringify_values(val)
  elsif val.is_a?(Symbol)
    val
  else
    val.to_s
  end
end
|
26
45
|
end
|
27
46
|
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module RelatonIso
  # Hash converter which creates RelatonIso document identifiers.
  module HashConverter
    include RelatonIsoBib::HashConverter
    extend self

    #
    # Create a document identifier. When the ID is a string and the
    # identifier is marked as primary, the string is parsed into a
    # Pubid::Iso::Identifier first. If parsing fails, a warning is logged and
    # the ID is kept as it was passed.
    #
    # @param [Hash] args arguments for DocumentIdentifier.new; `:id` and
    #   `:primary` are the only keys inspected here
    #
    # @return [RelatonIso::DocumentIdentifier]
    #
    def create_docid(**args)
      id = args[:id]
      if id.is_a?(String) && args[:primary]
        begin
          args[:id] = Pubid::Iso::Identifier.parse id
        rescue StandardError
          Util.warn "Unable to create a Pubid::Iso::Identifier from `#{id}`"
        end
      end
      DocumentIdentifier.new(**args)
    end
  end
end
|
data/lib/relaton_iso/hit.rb
CHANGED
@@ -4,28 +4,29 @@ module RelatonIso
|
|
4
4
|
# Hit.
|
5
5
|
class Hit < RelatonBib::Hit
|
6
6
|
# @return [RelatonIsoBib::IsoBibliographicItem]
|
7
|
-
attr_writer :fetch
|
7
|
+
attr_writer :fetch
|
8
|
+
|
9
|
+
# @return [Pubid::Iso::Identifier] pubid
|
10
|
+
attr_writer :pubid
|
8
11
|
|
9
12
|
# Update edition for pubid when provided in Bibliographic Item
|
10
|
-
def update_edition(bibliographic_item)
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
else
|
16
|
-
pubid.edition = bibliographic_item.edition.content
|
17
|
-
end
|
18
|
-
end
|
19
|
-
end
|
13
|
+
# def update_edition(bibliographic_item)
|
14
|
+
# if bibliographic_item.edition
|
15
|
+
# pubid.root.edition = bibliographic_item.edition.content
|
16
|
+
# end
|
17
|
+
# end
|
20
18
|
|
21
19
|
# Download the YAML document the hit points to (`hit[:file]` relative to
# HitCollection::ENDPOINT) and build a bibliographic item from it. The
# "fetched" field is set to the current date. The result is memoized.
#
# @param _lang [String, nil] not used
# @return [RelatonIsoBib::IsoBibliographicItem]
def fetch(_lang = nil)
  return @fetch if @fetch

  uri = URI("#{HitCollection::ENDPOINT}#{hit[:file]}")
  data = YAML.safe_load(Net::HTTP.get_response(uri).body)
  data["fetched"] = Date.today.to_s
  @fetch = RelatonIsoBib::IsoBibliographicItem.from_hash(data)
end
|
30
31
|
|
31
32
|
# @return [Integer]
|
@@ -41,11 +42,18 @@ module RelatonIso
|
|
41
42
|
|
42
43
|
# Identifier of the hit. It is created from `hit[:id]` on the first call
# unless it has been assigned before. If the creation fails, a warning is
# logged and nil is remembered as the identifier.
#
# @return [Pubid::Iso::Identifier, nil]
def pubid
  if instance_variable_defined?(:@pubid)
    @pubid
  else
    create_pubid(hit[:id])
  end
rescue StandardError
  Util.warn "Unable to create an identifier from #{hit[:id]}"
  @pubid = nil
end
|
52
|
+
|
53
|
+
private
|
54
|
+
|
55
|
+
# Store the identifier of the hit in `@pubid`. A hash is turned into an
# identifier with Pubid::Iso::Identifier.create, any other value is stored
# as is.
#
# @param id [Hash, Object] identifier parts or a ready identifier
# @return [Object] the stored identifier
def create_pubid(id)
  @pubid =
    if id.is_a?(Hash)
      Pubid::Iso::Identifier.create(**id)
    else
      id
    end
end
|
50
58
|
end
|
51
59
|
end
|
@@ -6,82 +6,97 @@ require "relaton_iso/hit"
|
|
6
6
|
module RelatonIso
|
7
7
|
# Page of hit collection.
|
8
8
|
class HitCollection < RelatonBib::HitCollection
|
9
|
-
|
10
|
-
|
9
|
+
INDEXFILE = "index-v1.yaml"
|
10
|
+
ENDPOINT = "https://raw.githubusercontent.com/relaton/relaton-data-iso/main/"
|
11
11
|
|
12
|
-
# @param text [
|
13
|
-
def initialize(
|
12
|
+
# Both arguments are passed on to the superclass initializer; the options
# are also kept in `@opts`.
#
# @param pubid [Pubid::Iso::Identifier] reference to search
# @param opts [Hash] options; `:all_parts` and `:lang` are the keys read by
#   the methods visible in this class
def initialize(pubid, opts = {})
  super(pubid, opts)
  @opts = opts
end
|
17
17
|
|
18
|
-
|
19
|
-
|
18
|
+
# @return [Pubid::Iso::Identifier]
|
19
|
+
alias ref_pubid text
|
20
|
+
|
21
|
+
# Copy of the reference identifier whose base (when there is one) has the
# year excluded. The copy is made with `dup`, so the base of the reference
# itself is not reassigned. The result is memoized.
#
# @return [Pubid::Iso::Identifier]
def ref_pubid_no_year
  return @ref_pubid_no_year if @ref_pubid_no_year

  copy = ref_pubid.dup
  copy.base = copy.base.exclude(:year) if copy.base
  @ref_pubid_no_year = copy
end
|
24
|
+
|
25
|
+
# Reference identifier without the year (see #ref_pubid_no_year) and
# without the parts listed by #excludings. The result is memoized.
#
# @return [Pubid::Iso::Identifier]
def ref_pubid_excluded
  return @ref_pubid_excluded if @ref_pubid_excluded

  reference = ref_pubid_no_year
  @ref_pubid_excluded = reference.exclude(*excludings)
end
|
28
|
+
|
29
|
+
# Search the index for the rows matching the reference and store them in
# `@array` as hits sorted by identifier string in descending order.
#
# A row whose `:id` is a hash is checked with #pubid_match?; any other `:id`
# has to be equal to the string form of the reference.
#
# @return [RelatonIso::HitCollection] self
def fetch # rubocop:disable Metrics/AbcSize
  rows = index.search do |row|
    id = row[:id]
    id.is_a?(Hash) ? pubid_match?(id) : ref_pubid.to_s == id
  end
  hits = rows.map { |row| Hit.new(row, self) }
  hits.sort_by! { |h| h.pubid.to_s }
  hits.reverse!
  @array = hits
  self
end
|
22
37
|
|
23
|
-
|
38
|
+
# Check whether an index identifier matches the reference.
#
# The identifier is created from the hash; year and edition are excluded
# from its base (when there is one). Then the parts listed by #excludings
# (plus the edition, unless the typed stage abbreviation is "DIR") are
# excluded and the result is compared with #ref_pubid_excluded.
#
# @param id [Hash] identifier parts from the index
# @return [Boolean] false when the identifier can't be created
def pubid_match?(id)
  candidate = create_pubid(id)
  return false unless candidate

  base = candidate.base
  candidate.base = base.exclude(:year, :edition) if base
  skipped = excludings.dup # never modify the list returned by #excludings
  skipped << :edition unless candidate.typed_stage_abbrev == "DIR"
  candidate.exclude(*skipped) == ref_pubid_excluded
end
|
47
|
+
|
48
|
+
# Create an identifier from the parts stored in the index. If the creation
# fails, a warning prefixed with the reference is logged.
#
# @param id [Hash] identifier parts
# @return [Pubid::Iso::Identifier, nil] nil when the creation failed
def create_pubid(id)
  begin
    return Pubid::Iso::Identifier.create(**id)
  rescue StandardError => e
    Util.warn "(#{ref_pubid}) WARNING: #{e.message}"
  end
  nil
end
|
54
|
+
|
55
|
+
# Identifier parts ignored when index identifiers are compared with the
# reference:
# - the year, always;
# - the part, when the reference has no part or all parts are requested;
# - the stage and the iteration, when the reference has no stage or all
#   parts are requested.
#
# The list is built once and memoized in `@excludings`.
#
# @return [Array<Symbol>]
def excludings
  return @excludings if defined? @excludings

  all_parts = @opts[:all_parts]
  excl_parts = %i[year]
  excl_parts << :part if ref_pubid.root.part.nil? || all_parts
  excl_parts.push(:stage, :iteration) if ref_pubid.stage.nil? || all_parts
  # excl_parts << :edition if ref_pubid.root.edition.nil? || all_parts
  @excludings = excl_parts
end
|
67
|
+
|
68
|
+
# Index of ISO documents, obtained with Relaton::Index.find_or_create from
# the `index-v1.zip` URL under ENDPOINT and the INDEXFILE file name.
# The result is memoized.
#
# @return [Object] whatever `Relaton::Index.find_or_create` returns
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:iso, url: "#{ENDPOINT}index-v1.zip", file: INDEXFILE)
end
|
71
|
+
|
72
|
+
# Fetch the document for the collection.
#
# When all parts are requested and the collection doesn't consist of
# exactly one hit, the result of #to_all_parts is returned. Otherwise the
# first hit is fetched; for an empty collection the result is false.
#
# @return [RelatonIsoBib::IsoBibliographicItem, false, nil]
def fetch_doc
  lang = @opts[:lang]
  return to_all_parts(lang) if @opts[:all_parts] && size != 1

  any? && first.fetch(lang)
end
|
79
|
+
|
80
|
+
# Build an "all parts" item. The hit with the lowest part number (`to_i`
# of the part) is fetched and converted with `to_all_parts`; every hit with
# a different part is added to it as a relation (see #create_relation).
#
# @param lang [String, nil] passed to Hit#fetch
# @return [RelatonIsoBib::IsoBibliographicItem, nil] nil for an empty
#   collection
def to_all_parts(lang = nil) # rubocop:disable Metrics/AbcSize
  lowest = @array.min_by { |h| h.pubid.part.to_i }
  return @array.first&.fetch(lang) unless lowest

  all_parts_item = lowest.fetch(lang).to_all_parts
  @array.each do |other|
    next if other.pubid.part == lowest.pubid.part

    all_parts_item.relation << create_relation(other)
  end
  all_parts_item
end
|
42
93
|
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
#
|
50
|
-
def fetch_github # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
51
|
-
ref = text.gsub(/[\s\/]/, "_").upcase
|
52
|
-
url = "https://raw.githubusercontent.com/relaton/relaton-data-iso/main/data/#{ref}.yaml"
|
53
|
-
resp = Net::HTTP.get_response URI(url)
|
54
|
-
return [] unless resp.code == "200"
|
55
|
-
|
56
|
-
hash = YAML.safe_load resp.body
|
57
|
-
bib_hash = RelatonIsoBib::HashConverter.hash_to_bib hash
|
58
|
-
bib_hash[:fetched] = Date.today.to_s
|
59
|
-
bib = RelatonIsoBib::IsoBibliographicItem.new(**bib_hash)
|
60
|
-
hit = Hit.new({ title: text }, self)
|
61
|
-
hit.fetch = bib
|
62
|
-
[hit]
|
63
|
-
end
|
64
|
-
|
65
|
-
#
|
66
|
-
# Fetch hits from iso.org
|
67
|
-
#
|
68
|
-
# @return [Array<RelatonIso::Hit>]
|
69
|
-
#
|
70
|
-
def fetch_iso # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
71
|
-
config = Algolia::Search::Config.new(application_id: "JCL49WV5AR", api_key: "dd1b9e1ab383f4d4817d29cd5e96d3f0")
|
72
|
-
client = Algolia::Search::Client.new config, logger: RelatonIso.configuration.logger
|
73
|
-
index = client.init_index "all_en"
|
74
|
-
resp = index.search text, hitsPerPage: 100, filters: "category:standard"
|
75
|
-
|
76
|
-
resp[:hits].map { |h| Hit.new h, self }.sort! do |a, b|
|
77
|
-
if a.sort_weight == b.sort_weight && b.hit[:year] = a.hit[:year]
|
78
|
-
a.hit[:title] <=> b.hit[:title]
|
79
|
-
elsif a.sort_weight == b.sort_weight
|
80
|
-
b.hit[:year] - a.hit[:year]
|
81
|
-
else
|
82
|
-
a.sort_weight - b.sort_weight
|
83
|
-
end
|
84
|
-
end
|
94
|
+
# Create an "instanceOf" relation to the document of the hit. The related
# item carries only a formatted reference and a primary ISO document
# identifier, both made from the identifier of the hit.
#
# @param hit [RelatonIso::Hit]
# @return [RelatonBib::DocumentRelation]
def create_relation(hit)
  identifier = DocumentIdentifier.new(id: hit.pubid, type: "ISO", primary: true)
  reference = RelatonBib::FormattedRef.new(content: hit.pubid.to_s)
  part = RelatonIsoBib::IsoBibliographicItem.new(formattedref: reference, docid: [identifier])
  RelatonBib::DocumentRelation.new(type: "instanceOf", bibitem: part)
end
|
86
101
|
end
|
87
102
|
end
|
@@ -0,0 +1,132 @@
|
|
1
|
+
module RelatonIso
  #
  # Index of ISO documents: an array of hashes with the `:id` and `:title`
  # keys, stored in a YAML file.
  #
  class Index
    #
    # Initialise index. If file path is given, the index is read from (and
    # saved to) that file. If file is not given, `index.yml` in the home
    # directory of the user is used; when it doesn't exist or is outdated,
    # the index is fetched from GitHub.
    #
    # @param [String, nil] file path to index file.
    #
    def initialize(file = nil)
      @file = file
    end

    #
    # Index rows, loaded on the first call (see #initialize for the sources).
    #
    # @return [Array<Hash>] index
    #
    def index
      @index ||= read_index || read_from_user_dir || fetch_index
    end

    #
    # Add or update index entry. The entry is looked up by the primary
    # document identifier of the item; its title is set to the content of
    # the first title of the item.
    #
    # @param [RelatonIsoBib::IsoBibliographicItem] item document
    #
    # @return [void]
    #
    def <<(item)
      id = item.docidentifier.detect(&:primary).id
      row = self[id] || begin
        r = { id: id }
        index << r
        r
      end
      row[:title] = item.title.first.title.content
    end

    #
    # Fetch document from index by ID.
    #
    # @param [String] id document ID
    #
    # @return [Hash, nil] index entry, nil when there is no such entry
    #
    def [](id)
      index.detect { |i| i[:id] == id }
    end

    #
    # Save index to file.
    #
    # @return [void]
    #
    def save
      serialize_and_save index
    end

    private

    #
    # Serialize index and save to file.
    #
    # The rows are written one by one, each as an item of a YAML sequence, so
    # the file reads back as an array of hashes. (Writing a row as a bare
    # mapping would merge all the rows into a single hash with repeated
    # keys.) An empty index is written as an empty sequence.
    #
    # @param [Array<Hash>] idx index
    #
    # @return [void]
    #
    def serialize_and_save(idx)
      File.open(@file, "w:UTF-8") do |f|
        if idx.empty?
          f.puts "--- []"
        else
          f.puts "---"
          idx.each do |row|
            f.puts [row.transform_keys(&:to_s)].to_yaml.sub("---\n", "")
          end
        end
      end
    end

    #
    # Read index from the file given to the constructor. If the file doesn't
    # exist, create empty index.
    #
    # @return [Array<Hash>, nil] index, nil when no file was given
    #
    def read_index
      if @file && File.exist?(@file) then read_file
      elsif @file then []
      end
    end

    #
    # Read index from `index.yml` in the home directory of the user. From
    # now on that file is the index file of this object.
    #
    # @return [Array<Hash>, nil] index, nil when the file doesn't exist or
    #   is outdated
    #
    def read_from_user_dir
      @file = File.join(Dir.home, "index.yml")
      read_file if File.exist?(@file) && !outdated?
    end

    #
    # Read and parse the index file.
    #
    # @return [Array<Hash>] index
    #
    def read_file
      yaml = File.read @file, encoding: "UTF-8"
      RelatonBib.parse_yaml yaml, [], symbolize_names: true
    end

    #
    # Check if index file is outdated.
    #
    # @return [Boolean] true if older than 24 hours
    #
    def outdated?
      (Time.now - File.mtime(@file)) / 3600 > 24
    end

    #
    # Fetch index from GitHub and save it to the index file.
    #
    # @return [Array<Hash>] index, empty when the download failed with an
    #   HTTP error
    #
    def fetch_index
      url = "https://raw.githubusercontent.com/relaton/relaton-data-iso/master/iso/index.zip"
      # the block form closes the downloaded stream as soon as it is read
      yaml = URI(url).open do |io|
        Zip::InputStream.new(io).get_next_entry.get_input_stream.read
      end
      idx = RelatonBib.parse_yaml yaml, [], symbolize_names: true
      serialize_and_save idx
      idx
    rescue OpenURI::HTTPError => e
      warn "[relaton-iso] WARNING: failed to fetch index: #{e.message}"
      []
    end
  end
end
|