relaton-iso 1.18.1 → 1.18.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a483674664dc939f3ccf8b46d36e275db5426d9c164e0ec1a799ea321cb47694
4
- data.tar.gz: 207870aaa9b1d89dc5dddd5922aac0b1c77c43b8bd0be292b2ffda7ccaeb712b
3
+ metadata.gz: 9a8570714304ad5ccdf68c5861a53b346f14235e0839bcaeba9b5221d93d0b4f
4
+ data.tar.gz: 39b230964324c2753d24440bb545233771251b2fcf2e99ef9774c9fc90f10593
5
5
  SHA512:
6
- metadata.gz: 39b94b85564592a94934e449d92ebc7fe5b90d536cba83947d179791b57fda3618a47853e1776822498e2654fa910c2ddc71b126c539655794917d790bd8f56a
7
- data.tar.gz: f34353074f0e0eb354b14b029726a964845fffbcf059bf6b9c5ef6e6a2cf0921dc84a25afc092c2b2da4088d3b833392c03dd20054b83195f40b16485a9ac04c
6
+ metadata.gz: a5b9cec3bbd724d0c0ea0fcabe4f499e57c1effa4c9aa4abeb3323b143608997de21a6013dd89f88048c811442139e44a2b61221abdcee08af688522314f2478
7
+ data.tar.gz: a4c3923795a224b728a7353988fc1f51f443cfbcec330de7ef2a45ee8d568193aa928ccd9eb790b551300ee75de084f7d5a2f01286a6babb95dd9d8e0d1264c4
@@ -0,0 +1,200 @@
1
module RelatonIso
  # Fetch all the documents from ISO website.
  class DataFetcher
    #
    # Initialize data fetcher.
    #
    # @param [String] output output directory
    # @param [String] format format of output files (yaml, bibxml, xml)
    #
    def initialize(output, format)
      @output = output
      @format = format
      # "bibxml" documents are stored with the plain "xml" extension
      @ext = format.sub(/^bib/, "")
      @files = []
      # work queue shared between the main thread and the worker threads
      @queue = ::Queue.new
      @mutex = Mutex.new
    end

    #
    # Index of the fetched documents (memoized).
    #
    def index
      @index ||= Relaton::Index.find_or_create :iso, file: HitCollection::INDEXFILE
    end

    #
    # Queue of document page paths collected from the ICS pages (memoized).
    #
    def iso_queue
      @iso_queue ||= RelatonIso::Queue.new
    end

    #
    # Initialize data fetcher and fetch data.
    #
    # @param [String] output output directory (default: "data")
    # @param [String] format format of output files. Allowed: yaml (default), bibxml, xml
    #
    # @return [void]
    #
    def self.fetch(output: "data", format: "yaml")
      t1 = Time.now
      puts "Started at: #{t1}"
      FileUtils.mkdir_p output
      new(output, format).fetch
      t2 = Time.now
      puts "Stopped at: #{t2}"
      puts "Done in: #{(t2 - t1).round} sec."
    end

    #
    # Go through all ICS and fetch all documents.
    #
    # @return [void]
    #
    def fetch # rubocop:disable Metrics/AbcSize
      puts "Scrapping ICS pages..."
      fetch_ics
      puts "[#{Time.now}] Scrapping documents..."
      fetch_docs
      iso_queue.save
      # index.sort! { |a, b| compare_docids a, b }
      index.save
    end

    #
    # Crawl the ICS catalogue starting from the root page. The root page is
    # processed in the calling thread, nested ICS pages are processed by
    # three worker threads. All links to documents end up in the iso_queue.
    #
    # @return [void]
    #
    def fetch_ics # rubocop:disable Metrics/MethodLength
      threads = Array.new(3) do
        thread do |path|
          fetch_ics_page(path)
        rescue StandardError => e
          # a broken page must not kill the worker, otherwise the crawl stalls
          warn "Error fetching ICS page: #{Scrapper::DOMAIN}#{path}"
          warn e.message
        end
      end
      fetch_ics_page "/standards-catalogue/browse-by-ics.html"
      # An empty queue is not enough to stop: a worker that is still parsing a
      # page may enqueue more links. Wait until every worker is idle as well.
      sleep(1) until ics_crawl_finished?(threads)
      threads.size.times { @queue << :END }
      threads.each(&:join)
    end

    #
    # Fetch one ICS page, add its document links to the iso_queue and put
    # the links to nested ICS pages into the work queue.
    #
    # @param [String] path path to ICS page
    #
    # @return [void]
    #
    def fetch_ics_page(path)
      resp = get_redirection path
      return unless resp # request failed after all retries, already reported

      page = Nokogiri::HTML(resp.body)
      # NOTE(review): iso_queue is updated here from several threads without
      # @mutex (save_doc does hold it) - confirm RelatonIso::Queue tolerates that.
      page.xpath("//td[@data-title='Standard and/or project']/div/div/a").each do |item|
        iso_queue.add_first item[:href].split("?").first
      end

      page.xpath("//td[@data-title='ICS']/a").each do |item|
        @queue << item[:href]
      end
    end

    #
    # Get the page from the given path. If the page is redirected, get the
    # page from the new path. Timeouts are retried (see #check_try).
    #
    # @param [String] path path to the page
    #
    # @return [Net::HTTPResponse, nil] HTTP response, or nil when the request
    #   still times out after the retries
    #
    def get_redirection(path) # rubocop:disable Metrics/MethodLength
      try = 0
      uri = URI(Scrapper::DOMAIN + path)
      begin
        get_response uri
      rescue Net::OpenTimeout, Net::ReadTimeout => e
        try += 1
        retry if check_try try, uri

        warn "Error fetching #{uri}"
        warn e.message
        nil
      end
    end

    #
    # Perform GET request and follow a 302 redirection.
    #
    # @param [URI] uri page URI
    #
    # @return [Net::HTTPResponse, nil] HTTP response
    #
    def get_response(uri)
      resp = Net::HTTP.get_response(uri)
      resp.code == "302" ? get_redirection(resp["location"]) : resp
    end

    #
    # Decide whether a timed out request should be retried. Waits a second
    # before a retry.
    #
    # @param [Integer] try number of failed attempts so far
    # @param [URI] uri page URI (used in the warning only)
    #
    # @return [Boolean] true if the request should be retried
    #
    def check_try(try, uri)
      return false unless try < 3

      warn "Timeout fetching #{uri}, retrying..."
      sleep 1
      true
    end

    #
    # Fetch the documents listed in the iso_queue using three worker threads.
    #
    # @return [void]
    #
    def fetch_docs
      threads = Array.new(3) { thread { |path| fetch_doc(path) } }
      iso_queue[0..10_000].each { |docpath| @queue << docpath }
      threads.size.times { @queue << :END }
      threads.each(&:join)
    end

    #
    # Fetch document from ISO website. Errors are reported and swallowed so
    # that one broken document does not stop the worker thread.
    #
    # @param [String] docpath document page path
    #
    # @return [void]
    #
    def fetch_doc(docpath)
      # path = docpath.sub(/\.html$/, "")
      # hit = Hit.new({ path: docpath }, nil)
      doc = Scrapper.parse_page docpath
      @mutex.synchronize { save_doc doc, docpath }
    rescue StandardError => e
      warn "Error fetching document: #{Scrapper::DOMAIN}#{docpath}"
      warn e.message
      warn e.backtrace
    end

    # def compare_docids(id1, id2)
    #   Pubid::Iso::Identifier.create(**id1).to_s <=> Pubid::Iso::Identifier.create(**id2).to_s
    # end

    #
    # Save document to file and register it in the index. A document whose
    # file name was already written during this run is reported as a
    # duplicate and skipped. Either way the path is moved to the end of the
    # iso_queue.
    #
    # @param [RelatonIsoBib::IsoBibliographicItem] doc document
    # @param [String] docpath document page path
    #
    # @return [void]
    #
    def save_doc(doc, docpath) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
      docid = doc.docidentifier.detect(&:primary)
      file_name = docid.id.gsub(/[\s\/:]+/, "-").downcase
      file = File.join @output, "#{file_name}.#{@ext}"
      if @files.include? file
        warn "Duplicate file #{file} for #{docid.id} from #{Scrapper::DOMAIN}#{docpath}"
      else
        # serialize first, so a failure leaves neither an index row nor a file
        content = serialize(doc)
        @files << file
        index.add_or_update docid.to_h, file
        File.write file, content, encoding: "UTF-8"
      end
      iso_queue.move_last docpath
    end

    #
    # Serialize document to string.
    #
    # @param [RelatonIsoBib::IsoBibliographicItem] doc document
    #
    # @return [String] serialized document
    #
    # @raise [ArgumentError] if the format is not yaml, bibxml, or xml
    #
    def serialize(doc)
      case @format
      when "yaml" then doc.to_hash.to_yaml
      when "bibxml" then doc.to_bibxml
      when "xml" then doc.to_xml bibdata: true
      else
        # without this branch nil would be written, i.e. an empty file
        raise ArgumentError, "Unknown format: #{@format.inspect}. Allowed: yaml, bibxml, xml"
      end
    end

    private

    #
    # Create thread worker. The worker passes every item popped from the
    # work queue to the given block and stops when it pops :END.
    #
    # @yieldparam [String] path item taken from the work queue
    #
    # @return [Thread] thread
    #
    def thread
      Thread.new do
        while (path = @queue.pop) != :END
          yield path
        end
      end
    end

    #
    # Check whether the ICS crawl is complete: nothing is queued and every
    # live worker is blocked waiting for the next item.
    #
    # @param [Array<Thread>] threads worker threads
    #
    # @return [Boolean]
    #
    def ics_crawl_finished?(threads)
      @queue.empty? && @queue.num_waiting >= threads.count(&:alive?)
    end
  end
end
@@ -1,6 +1,6 @@
1
1
  module RelatonIso
2
2
  class DocumentIdentifier < RelatonBib::DocumentIdentifier
3
- def id
3
+ def id # rubocop:disable Metrics/MethodLength
4
4
  id_str = @id.to_s.sub(/\sED\d+/, "").squeeze(" ").sub(/^ISO\/\s/, "ISO ") # workarounds for pubid gem bugs
5
5
  if @all_parts
6
6
  if type == "URN"
@@ -10,6 +10,12 @@ module RelatonIso
10
10
  end
11
11
  end
12
12
  type == "URN" ? @id.urn.to_s : id_str
13
+ rescue Pubid::Iso::Errors::NoEditionError => e
14
+ Util.warn "WARNING: #{type} identifier can't be generated for #{@id}: #{e.message}"
15
+ end
16
+
17
+ def to_h
18
+ stringify_values(@id.to_h) if @id.respond_to? :to_h
13
19
  end
14
20
 
15
21
  def remove_part
@@ -23,5 +29,18 @@ module RelatonIso
23
29
  def all_parts
24
30
  @all_parts = true
25
31
  end
32
+
33
+ def stringify_values(hash)
34
+ hash.transform_values { |v| stringify(v) }.reject { |_k, v| v.empty? }
35
+ end
36
+
37
+ def stringify(val)
38
+ case val
39
+ when Array then val.map { |i| i.is_a?(Hash) ? stringify_values(i) : i.to_s }
40
+ when Hash then stringify_values(val)
41
+ when Symbol then val
42
+ else val.to_s
43
+ end
44
+ end
26
45
  end
27
46
  end
@@ -0,0 +1,15 @@
1
module RelatonIso
  # Hash converter that builds ISO document identifiers.
  module HashConverter
    include RelatonIsoBib::HashConverter
    extend self

    #
    # Create a document identifier. The ID of a primary identifier given as
    # a String is parsed into a Pubid::Iso::Identifier; if parsing fails, a
    # warning is issued and the ID is kept as it was given.
    #
    # @param [Hash] args identifier attributes (:id, :primary, ...)
    #
    # @return [RelatonIso::DocumentIdentifier]
    #
    def create_docid(**args)
      raw_id = args[:id]
      if raw_id.is_a?(String) && args[:primary]
        begin
          args[:id] = Pubid::Iso::Identifier.parse raw_id
        rescue StandardError
          Util.warn "Unable to create a Pubid::Iso::Identifier from `#{raw_id}`"
        end
      end
      DocumentIdentifier.new(**args)
    end
  end
end
@@ -4,28 +4,29 @@ module RelatonIso
4
4
  # Hit.
5
5
  class Hit < RelatonBib::Hit
6
6
  # @return [RelatonIsoBib::IsoBibliographicItem]
7
- attr_writer :fetch, :pubid
7
+ attr_writer :fetch
8
+
9
+ # @return [Pubid::Iso::Identifier] pubid
10
+ attr_writer :pubid
8
11
 
9
12
  # Update edition for pubid when provided in Bibliographic Item
10
- def update_edition(bibliographic_item)
11
- if bibliographic_item.edition
12
- # add edition to base document if available
13
- if pubid.base
14
- pubid.base.edition = bibliographic_item.edition.content
15
- else
16
- pubid.edition = bibliographic_item.edition.content
17
- end
18
- end
19
- end
13
+ # def update_edition(bibliographic_item)
14
+ # if bibliographic_item.edition
15
+ # pubid.root.edition = bibliographic_item.edition.content
16
+ # end
17
+ # end
20
18
 
21
19
  # Parse page.
22
20
  # @param lang [String, nil]
23
21
  # @return [RelatonIso::IsoBibliographicItem]
24
- def fetch(lang = nil)
25
- @fetch ||= Scrapper.parse_page self, lang
26
- # update edition for pubid using fetched data
27
- update_edition(@fetch)
28
- @fetch
22
+ def fetch(_lang = nil)
23
+ @fetch ||= begin
24
+ url = "#{HitCollection::ENDPOINT}#{hit[:file]}"
25
+ resp = Net::HTTP.get_response URI(url)
26
+ hash = YAML.safe_load resp.body
27
+ hash["fetched"] = Date.today.to_s
28
+ RelatonIsoBib::IsoBibliographicItem.from_hash hash
29
+ end
29
30
  end
30
31
 
31
32
  # @return [Integer]
@@ -41,11 +42,18 @@ module RelatonIso
41
42
 
42
43
  # @return [Pubid::Iso::Identifier]
43
44
  def pubid
44
- @pubid ||= Pubid::Iso::Identifier.parse_from_title(hit[:title])
45
- rescue Pubid::Iso::Errors::WrongTypeError,
46
- Pubid::Iso::Errors::ParseError => e
47
- Util.warn "Unable to find an identifier in: `#{hit[:title]}`."
48
- Util.warn e.message
45
+ return @pubid if defined? @pubid
46
+
47
+ create_pubid hit[:id]
48
+ rescue StandardError
49
+ Util.warn "Unable to create an identifier from #{hit[:id]}"
50
+ @pubid = nil
51
+ end
52
+
53
+ private
54
+
55
+ def create_pubid(id)
56
+ @pubid = id.is_a?(Hash) ? Pubid::Iso::Identifier.create(**id) : id
49
57
  end
50
58
  end
51
59
  end
@@ -1,87 +1,101 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "algolia"
4
3
  require "relaton_iso/hit"
5
4
 
6
5
  module RelatonIso
7
6
  # Page of hit collection.
8
7
  class HitCollection < RelatonBib::HitCollection
9
- # @return [Boolean] whether the search was performed on GitHub
10
- attr_reader :from_gh
8
+ INDEXFILE = "index-v1.yaml"
9
+ ENDPOINT = "https://raw.githubusercontent.com/relaton/relaton-data-iso/main/"
11
10
 
12
- # @param text [String] reference to search
13
- def initialize(text)
11
+ # @param text [Pubid::Iso::Identifier] reference to search
12
+ def initialize(pubid, opts = {})
14
13
  super
15
- @from_gh = text.match?(/^ISO[\s\/](?:TC\s184\/SC\s?4|IEC\sDIR\s(?:\d|IEC|JTC))/)
14
+ @opts = opts
16
15
  end
17
16
 
18
- def fetch
19
- @array = from_gh ? fetch_github : fetch_iso
17
+ # @return [Pubid::Iso::Identifier]
18
+ alias ref_pubid text
19
+
20
+ def ref_pubid_no_year
21
+ @ref_pubid_no_year ||= ref_pubid.dup.tap { |r| r.base = r.base.exclude(:year) if r.base }
22
+ end
23
+
24
+ def ref_pubid_excluded
25
+ @ref_pubid_excluded ||= ref_pubid_no_year.exclude(*excludings)
26
+ end
27
+
28
+ def fetch # rubocop:disable Metrics/AbcSize
29
+ @array = index.search do |row|
30
+ row[:id].is_a?(Hash) ? pubid_match?(row[:id]) : ref_pubid.to_s == row[:id]
31
+ end.map { |row| Hit.new row, self }
32
+ .sort_by! { |h| h.pubid.to_s }
33
+ .reverse!
20
34
  self
21
35
  end
22
36
 
23
- # @param lang [String, NilClass]
37
+ def pubid_match?(id)
38
+ pubid = create_pubid(id)
39
+ return false unless pubid
40
+
41
+ pubid.base = pubid.base.exclude(:year, :edition) if pubid.base
42
+ dir_excludings = excludings.dup
43
+ dir_excludings << :edition unless pubid.typed_stage_abbrev == "DIR"
44
+ pubid.exclude(*dir_excludings) == ref_pubid_excluded
45
+ end
46
+
47
+ def create_pubid(id)
48
+ Pubid::Iso::Identifier.create(**id)
49
+ rescue StandardError => e
50
+ Util.warn "(#{ref_pubid}) WARNING: #{e.message}"
51
+ nil
52
+ end
53
+
54
#
# Parts of an identifier to leave out when it is compared with the
# reference. The list is built once and memoized.
#
# :year is always excluded. :part is excluded when the reference has no
# part or all parts are requested. :stage and :iteration are excluded when
# the reference has no stage or all parts are requested.
#
# @return [Array<Symbol>]
#
def excludings
  return @excludings if defined? @excludings

  excl_parts = %i[year]
  excl_parts << :part if ref_pubid.root.part.nil? || @opts[:all_parts]
  excl_parts.push(:stage, :iteration) if ref_pubid.stage.nil? || @opts[:all_parts]
  # excl_parts << :edition if ref_pubid.root.edition.nil? || all_parts
  # store under the same name the guard above checks, otherwise the list
  # is rebuilt on every call
  @excludings = excl_parts
end
66
+
67
+ def index
68
+ @index ||= Relaton::Index.find_or_create :iso, url: "#{ENDPOINT}index-v1.zip", file: INDEXFILE
69
+ end
70
+
71
+ def fetch_doc
72
+ if !@opts[:all_parts] || size == 1
73
+ any? && first.fetch(@opts[:lang])
74
+ else
75
+ to_all_parts(@opts[:lang])
76
+ end
77
+ end
78
+
79
+ # @param lang [String, nil]
24
80
  # @return [RelatonIsoBib::IsoBibliographicItem, nil]
25
- def to_all_parts(lang = nil) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
26
- # parts = @array.reject { |h| h.hit["docPart"]&.empty? }
81
+ def to_all_parts(lang = nil) # rubocop:disable Metrics/AbcSize
27
82
  hit = @array.min_by { |h| h.pubid.part.to_i }
28
83
  return @array.first&.fetch lang unless hit
29
84
 
30
85
  bibitem = hit.fetch(lang)
31
86
  all_parts_item = bibitem.to_all_parts
32
- @array.reject { |h| h.hit[:uuid] == hit.hit[:uuid] }.each do |hi|
33
- isobib = RelatonIsoBib::IsoBibliographicItem.new(
34
- formattedref: RelatonBib::FormattedRef.new(content: hi.pubid.to_s),
35
- )
36
- all_parts_item.relation << RelatonBib::DocumentRelation.new(
37
- type: "instanceOf", bibitem: isobib,
38
- )
87
+ @array.reject { |h| h.pubid.part == hit.pubid.part }.each do |hi|
88
+ all_parts_item.relation << create_relation(hi)
39
89
  end
40
90
  all_parts_item
41
91
  end
42
92
 
43
- private
44
-
45
- #
46
- # Fetch document from GitHub repository
47
- #
48
- # @return [Array<RelatonIso::Hit]
49
- #
50
- def fetch_github # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
51
- ref = text.gsub(/[\s\/]/, "_").upcase
52
- url = "https://raw.githubusercontent.com/relaton/relaton-data-iso/main/data/#{ref}.yaml"
53
- resp = Net::HTTP.get_response URI(url)
54
- return [] unless resp.code == "200"
55
-
56
- hash = YAML.safe_load resp.body
57
- bib_hash = RelatonIsoBib::HashConverter.hash_to_bib hash
58
- bib_hash[:fetched] = Date.today.to_s
59
- bib = RelatonIsoBib::IsoBibliographicItem.new(**bib_hash)
60
- hit = Hit.new({ title: text }, self)
61
- hit.fetch = bib
62
- [hit]
63
- end
64
-
65
- #
66
- # Fetch hits from iso.org
67
- #
68
- # @return [Array<RelatonIso::Hit>]
69
- #
70
- def fetch_iso # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
71
- config = Algolia::Search::Config.new(application_id: "JCL49WV5AR", api_key: "dd1b9e1ab383f4d4817d29cd5e96d3f0")
72
- client = Algolia::Search::Client.new config, logger: RelatonIso.configuration.logger
73
- index = client.init_index "all_en"
74
- resp = index.search text, hitsPerPage: 100, filters: "category:standard"
75
-
76
- resp[:hits].map { |h| Hit.new h, self }.sort! do |a, b|
77
- if a.sort_weight == b.sort_weight && b.hit[:year] = a.hit[:year]
78
- a.hit[:title] <=> b.hit[:title]
79
- elsif a.sort_weight == b.sort_weight
80
- b.hit[:year] - a.hit[:year]
81
- else
82
- a.sort_weight - b.sort_weight
83
- end
84
- end
93
+ def create_relation(hit)
94
+ docid = DocumentIdentifier.new(id: hit.pubid, type: "ISO", primary: true)
95
+ isobib = RelatonIsoBib::IsoBibliographicItem.new(
96
+ formattedref: RelatonBib::FormattedRef.new(content: hit.pubid.to_s), docid: [docid],
97
+ )
98
+ RelatonBib::DocumentRelation.new(type: "instanceOf", bibitem: isobib)
85
99
  end
86
100
  end
87
101
  end
@@ -0,0 +1,132 @@
1
module RelatonIso
  # Index of documents: an array of `{ id:, title: }` rows kept in a YAML file.
  class Index
    #
    # Initialise index. If file path is given, the index is read from that
    # file. If file is not given, `index.yml` in the user's home directory
    # is used; if it doesn't exist, or is outdated, the index is fetched
    # from GitHub.
    #
    # @param [String, nil] file path to index file.
    #
    def initialize(file = nil)
      @file = file
    end

    #
    # Load the index on first use and memoize it.
    #
    # @return [Array<Hash>] index
    #
    def index
      @index ||= read_index || read_from_user_dir || fetch_index
    end

    #
    # Add or update index entry. The entry is looked up by the ID of the
    # primary document identifier; its title is set from the first title
    # of the document.
    #
    # @param [RelatonIsoBib::IsoBibliographicItem] item document
    #
    # @return [void]
    #
    def <<(item)
      id = item.docidentifier.detect(&:primary).id
      row = self[id] || { id: id }.tap { |r| index << r }
      row[:title] = item.title.first.title.content
    end

    #
    # Fetch document from index by ID.
    #
    # @param [String] id document ID
    #
    # @return [Hash, nil] index entry, nil if there is no entry with the ID
    #
    def [](id)
      index.detect { |i| i[:id] == id }
    end

    #
    # Save index to file.
    #
    # @return [void]
    #
    def save
      serialize_and_save index
    end

    private

    #
    # Serialize index and save to file. Rows are written one by one, each
    # as an item of a single YAML sequence.
    #
    # @param [Array<Hash>] idx index
    #
    # @return [void]
    #
    def serialize_and_save(idx)
      File.open(@file, "w:UTF-8") do |f|
        f.puts "---"
        idx.each do |i|
          # Dump the row wrapped in an array to get a "- id: ..." sequence
          # item. Dumping the bare hash would produce repeated top-level
          # keys, i.e. one mapping instead of a list of rows.
          f.puts [i.transform_keys(&:to_s)].to_yaml.sub("---\n", "")
        end
      end
    end

    #
    # Read index from the file given to the constructor. If that file
    # doesn't exist, create empty index.
    #
    # @return [Array<Hash>, nil] index, nil if no file was given
    #
    def read_index
      if @file && File.exist?(@file) then read_file
      elsif @file then []
      end
    end

    #
    # Read index from `index.yml` in the user's home directory. From now on
    # that file is also the one the index is saved to.
    #
    # @return [Array<Hash>, nil] index, nil if the file doesn't exist or is
    #   outdated
    #
    def read_from_user_dir
      @file = File.join(Dir.home, "index.yml")
      read_file if File.exist?(@file) && !outdated?
    end

    #
    # Read and parse the index file.
    #
    # @return [Array<Hash>] index
    #
    def read_file
      yaml = File.read @file, encoding: "UTF-8"
      RelatonBib.parse_yaml yaml, [], symbolize_names: true
    end

    #
    # Check if index file is outdated.
    #
    # @return [Boolean] true if older than 24 hours
    #
    def outdated?
      (Time.now - File.mtime(@file)) / 3600 > 24
    end

    #
    # Fetch zipped index from GitHub and save it to the index file.
    #
    # @return [Array<Hash>] index, empty if the download failed
    #
    def fetch_index
      url = "https://raw.githubusercontent.com/relaton/relaton-data-iso/master/iso/index.zip"
      zip = Zip::InputStream.new URI(url).open
      yaml = zip.get_next_entry.get_input_stream.read
      idx = RelatonBib.parse_yaml yaml, [], symbolize_names: true
      serialize_and_save idx
      idx
    rescue OpenURI::HTTPError => e
      warn "[relaton-iso] WARNING: failed to fetch index: #{e.message}"
      []
    end
  end
end