relaton-iso 1.18.1 → 1.18.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a483674664dc939f3ccf8b46d36e275db5426d9c164e0ec1a799ea321cb47694
4
- data.tar.gz: 207870aaa9b1d89dc5dddd5922aac0b1c77c43b8bd0be292b2ffda7ccaeb712b
3
+ metadata.gz: 395d9fbaf99042f03785dd5b529d1a1821b8e5939a378503b6af4898901363d5
4
+ data.tar.gz: 4745ddb2a95d4b5dfc2290afdc2116c0ed9f113cda3c717909ca9987eec33486
5
5
  SHA512:
6
- metadata.gz: 39b94b85564592a94934e449d92ebc7fe5b90d536cba83947d179791b57fda3618a47853e1776822498e2654fa910c2ddc71b126c539655794917d790bd8f56a
7
- data.tar.gz: f34353074f0e0eb354b14b029726a964845fffbcf059bf6b9c5ef6e6a2cf0921dc84a25afc092c2b2da4088d3b833392c03dd20054b83195f40b16485a9ac04c
6
+ metadata.gz: fb0e270a99fc7a4a8cd07bf0f28d7c5d157976ffaf436523dd9ff871950726af552447f4e4eea1802eb0a59856702fd9ca95757b1b7fea3a89f69e5f83d3f876
7
+ data.tar.gz: 35d93ed7fdead4846059a22485ff56652b727640d754a2e4523350706966107418c3fb723eb278abaf17460f80612394dc61bf9720907294d821cacbc84e6b7f
@@ -0,0 +1,200 @@
1
module RelatonIso
  # Fetch all the documents from ISO website.
  class DataFetcher
    # Output formats that {#serialize} knows how to produce.
    FORMATS = %w[yaml bibxml xml].freeze

    #
    # Initialize data fetcher.
    #
    # @param [String] output output directory
    # @param [String] format format of output files (yaml, bibxml, xml)
    #
    # @raise [ArgumentError] if the format is not one of {FORMATS}; failing
    #   here avoids crawling the whole site only to write empty files
    #
    def initialize(output, format)
      unless FORMATS.include? format
        raise ArgumentError, "Unsupported format `#{format}`. Allowed: #{FORMATS.join(', ')}"
      end

      @output = output
      @format = format
      @ext = format.sub(/^bib/, "")
      @files = {} # paths already written; a Hash used as a set for O(1) lookups
      @queue = ::Queue.new
      @mutex = Mutex.new
      @ics_done = ConditionVariable.new
      @ics_pending = 0 # ICS pages that are queued or being processed by a worker
    end

    # @return [Relaton::Index] memoized index of the fetched documents
    def index
      @index ||= Relaton::Index.find_or_create :iso, file: HitCollection::INDEXFILE
    end

    # @return [RelatonIso::Queue] memoized queue of document page paths
    def iso_queue
      @iso_queue ||= RelatonIso::Queue.new
    end

    #
    # Initialize data fetcher and fetch data.
    #
    # @param [String] output output directory (default: "data")
    # @param [String] format format of output files. Allowed: yaml (default), bibxml, xml
    #
    # @return [void]
    #
    def self.fetch(output: "data", format: "yaml")
      t1 = Time.now
      puts "Started at: #{t1}"
      FileUtils.mkdir_p output
      new(output, format).fetch
      t2 = Time.now
      puts "Stopped at: #{t2}"
      puts "Done in: #{(t2 - t1).round} sec."
    end

    #
    # Go through all ICS and fetch all documents.
    #
    # @return [void]
    #
    def fetch # rubocop:disable Metrics/AbcSize
      puts "Scrapping ICS pages..."
      fetch_ics
      puts "[#{Time.now}] Scrapping documents..."
      fetch_docs
      iso_queue.save
      # index.sort! { |a, b| compare_docids a, b }
      index.save
    end

    #
    # Crawl the ICS tree with three worker threads, starting from the
    # catalogue root, and store all the links to documents in the iso_queue.
    #
    # The workers are stopped only when no ICS page is queued or in flight.
    # Checking `@queue.empty?` alone is not enough: a worker that is still
    # parsing a page may enqueue more pages after the queue looked empty.
    #
    # @return [void]
    #
    def fetch_ics
      threads = Array.new(3) { thread { |path| fetch_queued_ics_page(path) } }
      fetch_ics_page "/standards-catalogue/browse-by-ics.html"
      wait_for_ics_pages
      threads.size.times { @queue << :END }
      threads.each(&:join)
    end

    #
    # Fetch one ICS page, put the document links it lists at the front of the
    # iso_queue and enqueue the nested ICS pages for the workers.
    #
    # Errors are reported and swallowed so that one bad page does not kill
    # the worker thread (same policy as {#fetch_doc}).
    #
    # @param [String] path path to ICS page
    #
    # @return [void]
    #
    def fetch_ics_page(path)
      resp = get_redirection path
      return unless resp # retries exhausted; get_redirection already warned

      page = Nokogiri::HTML(resp.body)
      page.xpath("//td[@data-title='Standard and/or project']/div/div/a").each do |item|
        iso_queue.add_first item[:href].split("?").first
      end

      page.xpath("//td[@data-title='ICS']/a").each do |item|
        enqueue_ics_page item[:href]
      end
    rescue StandardError => e
      warn "Error fetching ICS page: #{Scrapper::DOMAIN}#{path}"
      warn e.message
    end

    #
    # Get the page from the given path. If the page is redirected, get the
    # page from the new path. Timeouts are retried (see #check_try).
    #
    # @param [String] path path to the page
    #
    # @return [Net::HTTPResponse, nil] HTTP response, or nil when the request
    #   keeps timing out
    #
    def get_redirection(path) # rubocop:disable Metrics/MethodLength
      try = 0
      uri = URI(Scrapper::DOMAIN + path)
      begin
        get_response uri
      rescue Net::OpenTimeout, Net::ReadTimeout => e
        try += 1
        retry if check_try try, uri

        warn "Error fetching #{uri}"
        warn e.message
      end
    end

    #
    # Perform a GET request and follow a 302 redirect through #get_redirection.
    #
    # @param [URI] uri address to request
    #
    # @return [Net::HTTPResponse, nil] HTTP response
    #
    def get_response(uri)
      resp = Net::HTTP.get_response(uri)
      resp.code == "302" ? get_redirection(resp["location"]) : resp
    end

    #
    # Decide whether a timed out request should be retried. When it should,
    # warn and pause for a second before the retry.
    #
    # @param [Integer] try number of failed attempts so far
    # @param [URI] uri address being requested (used in the warning only)
    #
    # @return [true, nil] true while fewer than 3 attempts failed
    #
    def check_try(try, uri)
      if try < 3
        warn "Timeout fetching #{uri}, retrying..."
        sleep 1
        true
      end
    end

    #
    # Fetch the documents from the iso_queue with three worker threads.
    # Only the `iso_queue[0..10_000]` slice is processed per run.
    #
    # @return [void]
    #
    def fetch_docs
      threads = Array.new(3) { thread { |path| fetch_doc(path) } }
      iso_queue[0..10_000].each { |docpath| @queue << docpath }
      threads.size.times { @queue << :END }
      threads.each(&:join)
    end

    #
    # Fetch document from ISO website.
    #
    # @param [String] docpath document page path
    #
    # @return [void]
    #
    def fetch_doc(docpath)
      # path = docpath.sub(/\.html$/, "")
      # hit = Hit.new({ path: docpath }, nil)
      doc = Scrapper.parse_page docpath
      @mutex.synchronize { save_doc doc, docpath }
    rescue StandardError => e
      warn "Error fetching document: #{Scrapper::DOMAIN}#{docpath}"
      warn e.message
      warn e.backtrace
    end

    # def compare_docids(id1, id2)
    #   Pubid::Iso::Identifier.create(**id1).to_s <=> Pubid::Iso::Identifier.create(**id2).to_s
    # end

    #
    # Save document to file and register it in the index. A document whose
    # file name was already written in this run is reported as a duplicate
    # and not written again. In both cases the path is moved to the end of
    # the iso_queue.
    #
    # @param [RelatonIsoBib::IsoBibliographicItem] doc document
    # @param [String] docpath document page path
    #
    # @return [void]
    #
    def save_doc(doc, docpath) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
      docid = doc.docidentifier.detect(&:primary)
      file_name = docid.id.gsub(/[\s\/:]+/, "-").downcase
      file = File.join @output, "#{file_name}.#{@ext}"
      if @files.key? file
        warn "Duplicate file #{file} for #{docid.id} from #{Scrapper::DOMAIN}#{docpath}"
      else
        @files[file] = true
        index.add_or_update docid.to_h, file
        File.write file, serialize(doc), encoding: "UTF-8"
      end
      iso_queue.move_last docpath
    end

    #
    # Serialize document to string.
    #
    # @param [RelatonIsoBib::IsoBibliographicItem] doc document
    #
    # @return [String] serialized document
    #
    def serialize(doc)
      case @format
      when "yaml" then doc.to_hash.to_yaml
      when "bibxml" then doc.to_bibxml
      when "xml" then doc.to_xml bibdata: true
      end
    end

    private

    #
    # Create thread worker. The worker yields every value popped from the
    # work queue until it pops the `:END` marker.
    #
    # @return [Thread] thread
    #
    def thread
      Thread.new do
        while (path = @queue.pop) != :END
          yield path
        end
      end
    end

    # Count an ICS page as pending, then hand it over to the workers.
    def enqueue_ics_page(path)
      @mutex.synchronize { @ics_pending += 1 }
      @queue << path
    end

    # Worker body: process a queued ICS page and mark it as finished, waking
    # up #wait_for_ics_pages once nothing is pending any more.
    def fetch_queued_ics_page(path)
      fetch_ics_page path
    ensure
      @mutex.synchronize do
        @ics_pending -= 1
        @ics_done.broadcast if @ics_pending.zero?
      end
    end

    # Block until every enqueued ICS page has been fully processed.
    def wait_for_ics_pages
      @mutex.synchronize do
        @ics_done.wait(@mutex) until @ics_pending.zero?
      end
    end
  end
end
@@ -1,6 +1,6 @@
1
1
  module RelatonIso
2
2
  class DocumentIdentifier < RelatonBib::DocumentIdentifier
3
- def id
3
+ def id # rubocop:disable Metrics/MethodLength
4
4
  id_str = @id.to_s.sub(/\sED\d+/, "").squeeze(" ").sub(/^ISO\/\s/, "ISO ") # workarounds for pubid gem bugs
5
5
  if @all_parts
6
6
  if type == "URN"
@@ -10,6 +10,12 @@ module RelatonIso
10
10
  end
11
11
  end
12
12
  type == "URN" ? @id.urn.to_s : id_str
13
+ rescue Pubid::Iso::Errors::NoEditionError => e
14
+ Util.warn "WARNING: #{type} identifier can't be generated for #{@id}: #{e.message}"
15
+ end
16
+
17
# Hash form of the identifier with its values turned into strings.
#
# @return [Hash, nil] nil when the wrapped id does not respond to `to_h`
def to_h
  return unless @id.respond_to? :to_h

  stringify_values(@id.to_h)
end
14
20
 
15
21
  def remove_part
@@ -23,5 +29,18 @@ module RelatonIso
23
29
  def all_parts
24
30
  @all_parts = true
25
31
  end
32
+
33
# Stringify every value of the hash (see #stringify) and drop the entries
# whose stringified value is empty.
#
# @param hash [Hash]
# @return [Hash]
def stringify_values(hash)
  hash.each_with_object({}) do |(key, value), result|
    converted = stringify(value)
    result[key] = converted unless converted.empty?
  end
end

# Convert a value to its string form. Hashes (also those nested in arrays)
# are converted recursively, symbols are kept as they are.
#
# @param val [Object]
# @return [String, Symbol, Array, Hash]
def stringify(val)
  if val.is_a?(Array)
    val.map { |item| item.is_a?(Hash) ? stringify_values(item) : item.to_s }
  elsif val.is_a?(Hash)
    stringify_values(val)
  elsif val.is_a?(Symbol)
    val
  else
    val.to_s
  end
end
26
45
  end
27
46
  end
@@ -0,0 +1,15 @@
1
module RelatonIso
  # Hash converter that builds RelatonIso document identifiers.
  module HashConverter
    include RelatonIsoBib::HashConverter
    extend self

    #
    # Create a document identifier. A primary identifier given as a String
    # is parsed into a Pubid::Iso::Identifier first; when parsing fails a
    # warning is issued and the original string is used.
    #
    # @param args [Hash] DocumentIdentifier attributes (:id, :primary, ...)
    #
    # @return [RelatonIso::DocumentIdentifier]
    #
    def create_docid(**args)
      raw_id = args[:id]
      if raw_id.is_a?(String) && args[:primary]
        begin
          args[:id] = Pubid::Iso::Identifier.parse raw_id
        rescue StandardError
          Util.warn "Unable to create a Pubid::Iso::Identifier from `#{args[:id]}`"
        end
      end
      DocumentIdentifier.new(**args)
    end
  end
end
@@ -4,28 +4,29 @@ module RelatonIso
4
4
  # Hit.
5
5
  class Hit < RelatonBib::Hit
6
6
  # @return [RelatonIsoBib::IsoBibliographicItem]
7
- attr_writer :fetch, :pubid
7
+ attr_writer :fetch
8
+
9
+ # @return [Pubid::Iso::Identifier] pubid
10
+ attr_writer :pubid
8
11
 
9
12
  # Update edition for pubid when provided in Bibliographic Item
10
- def update_edition(bibliographic_item)
11
- if bibliographic_item.edition
12
- # add edition to base document if available
13
- if pubid.base
14
- pubid.base.edition = bibliographic_item.edition.content
15
- else
16
- pubid.edition = bibliographic_item.edition.content
17
- end
18
- end
19
- end
13
+ # def update_edition(bibliographic_item)
14
+ # if bibliographic_item.edition
15
+ # pubid.root.edition = bibliographic_item.edition.content
16
+ # end
17
+ # end
20
18
 
21
19
  # Parse page.
22
20
  # @param lang [String, nil]
23
21
  # @return [RelatonIso::IsoBibliographicItem]
24
- def fetch(lang = nil)
25
- @fetch ||= Scrapper.parse_page self, lang
26
- # update edition for pubid using fetched data
27
- update_edition(@fetch)
28
- @fetch
22
# Download the document's YAML file from the data repository and build
# the bibliographic item from it. The result is memoized.
#
# NOTE(review): the HTTP status is not checked before the body is parsed.
#
# @param _lang [String, nil] not used
# @return [RelatonIsoBib::IsoBibliographicItem]
def fetch(_lang = nil)
  return @fetch if @fetch

  uri = URI("#{HitCollection::ENDPOINT}#{hit[:file]}")
  data = YAML.safe_load Net::HTTP.get_response(uri).body
  data["fetched"] = Date.today.to_s
  @fetch = RelatonIsoBib::IsoBibliographicItem.from_hash data
end
30
31
 
31
32
  # @return [Integer]
@@ -41,11 +42,18 @@ module RelatonIso
41
42
 
42
43
  # @return [Pubid::Iso::Identifier]
43
44
  def pubid
44
- @pubid ||= Pubid::Iso::Identifier.parse_from_title(hit[:title])
45
- rescue Pubid::Iso::Errors::WrongTypeError,
46
- Pubid::Iso::Errors::ParseError => e
47
- Util.warn "Unable to find an identifier in: `#{hit[:title]}`."
48
- Util.warn e.message
45
+ return @pubid if defined? @pubid
46
+
47
+ create_pubid hit[:id]
48
+ rescue StandardError
49
+ Util.warn "Unable to create an identifier from #{hit[:id]}"
50
+ @pubid = nil
51
+ end
52
+
53
+ private
54
+
55
+ def create_pubid(id)
56
+ @pubid = id.is_a?(Hash) ? Pubid::Iso::Identifier.create(**id) : id
49
57
  end
50
58
  end
51
59
  end
@@ -6,82 +6,97 @@ require "relaton_iso/hit"
6
6
  module RelatonIso
7
7
  # Page of hit collection.
8
8
  class HitCollection < RelatonBib::HitCollection
9
- # @return [Boolean] whether the search was performed on GitHub
10
- attr_reader :from_gh
9
+ INDEXFILE = "index-v1.yaml"
10
+ ENDPOINT = "https://raw.githubusercontent.com/relaton/relaton-data-iso/main/"
11
11
 
12
- # @param text [String] reference to search
13
- def initialize(text)
12
# @param pubid [Pubid::Iso::Identifier] reference to search
# @param opts [Hash] search options; this class reads `:all_parts` and `:lang`
def initialize(pubid, opts = {})
  super # forwards both arguments to the parent initializer
  @opts = opts
end
17
17
 
18
- def fetch
19
- @array = from_gh ? fetch_github : fetch_iso
18
+ # @return [Pubid::Iso::Identifier]
19
+ alias ref_pubid text
20
+
21
# Copy of the reference identifier whose base (if any) has its year excluded.
# The reference identifier itself is not modified; the copy is memoized.
#
# @return [Pubid::Iso::Identifier]
def ref_pubid_no_year
  return @ref_pubid_no_year if @ref_pubid_no_year

  copy = ref_pubid.dup
  copy.base = copy.base.exclude(:year) if copy.base
  @ref_pubid_no_year = copy
end
24
+
25
# Year-less reference identifier with the parts listed by #excludings
# excluded. Memoized.
#
# @return [Pubid::Iso::Identifier]
def ref_pubid_excluded
  return @ref_pubid_excluded if @ref_pubid_excluded

  @ref_pubid_excluded = ref_pubid_no_year.exclude(*excludings)
end
28
+
29
# Search the index for rows matching the reference and keep them as hits,
# ordered by descending string form of their identifiers.
#
# Rows with a Hash id are compared through #pubid_match?, other rows by
# comparing the id with the string form of the reference.
#
# @return [RelatonIso::HitCollection] self
def fetch # rubocop:disable Metrics/AbcSize
  rows = index.search do |row|
    id = row[:id]
    id.is_a?(Hash) ? pubid_match?(id) : ref_pubid.to_s == id
  end
  hits = rows.map { |row| Hit.new row, self }
  @array = hits.sort_by { |h| h.pubid.to_s }.reverse
  self
end
22
37
 
23
- # @param lang [String, NilClass]
38
# Check whether an index row's identifier matches the reference once the
# parts that are irrelevant for the search are excluded from both.
#
# The edition is additionally excluded from the row's identifier unless
# its typed stage abbreviation is "DIR".
#
# @param id [Hash] identifier parts stored in the index row
# @return [Boolean] false also when the identifier can't be created
def pubid_match?(id)
  candidate = create_pubid(id)
  return false unless candidate

  base = candidate.base
  candidate.base = base.exclude(:year, :edition) if base
  parts = candidate.typed_stage_abbrev == "DIR" ? excludings : excludings + [:edition]
  candidate.exclude(*parts) == ref_pubid_excluded
end
47
+
48
# Build an identifier from the parts stored in an index row. On failure a
# warning prefixed with the reference is issued.
#
# @param id [Hash] identifier parts
# @return [Pubid::Iso::Identifier, nil] nil when the identifier can't be created
def create_pubid(id)
  Pubid::Iso::Identifier.create(**id)
rescue StandardError => error
  Util.warn "(#{ref_pubid}) WARNING: #{error.message}"
  nil
end
54
+
55
# Identifier parts to ignore when comparing index rows with the reference.
#
# The year is always ignored. The part is ignored when the reference has
# no part or all parts are requested. Stage and iteration are ignored
# when the reference has no stage or all parts are requested.
#
# The list is computed once and memoized in `@excludings`, the variable
# the guard clause checks; the original stored it under a misspelled name,
# so the memo was never hit.
#
# @return [Array<Symbol>]
def excludings
  return @excludings if defined? @excludings

  excl_parts = %i[year]
  excl_parts << :part if ref_pubid.root.part.nil? || @opts[:all_parts]
  if ref_pubid.stage.nil? || @opts[:all_parts]
    excl_parts << :stage
    excl_parts << :iteration
  end
  # excl_parts << :edition if ref_pubid.root.edition.nil? || all_parts
  @excludings = excl_parts
end
67
+
68
# Index of ISO documents, found or created from the zipped index at
# ENDPOINT. Memoized.
#
# @return [Relaton::Index]
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:iso, url: "#{ENDPOINT}index-v1.zip", file: INDEXFILE)
end
71
+
72
# Fetch the document for the collected hits: the all-parts item when all
# parts are requested and the collection doesn't hold exactly one hit,
# otherwise the first hit's document.
#
# @return [RelatonIsoBib::IsoBibliographicItem, false, nil] false when the
#   first hit is requested but the collection is empty
def fetch_doc
  lang = @opts[:lang]
  return to_all_parts(lang) if @opts[:all_parts] && size != 1

  any? && first.fetch(lang)
end
79
+
80
+ # @param lang [String, nil]
24
81
  # @return [RelatonIsoBib::IsoBibliographicItem, nil]
25
- def to_all_parts(lang = nil) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
26
- # parts = @array.reject { |h| h.hit["docPart"]&.empty? }
82
# Build an all-parts item from the hit with the lowest part number and
# relate to it every hit whose part differs from that hit's part.
#
# @param lang [String, nil]
# @return [RelatonIsoBib::IsoBibliographicItem, nil] nil when there are no hits
def to_all_parts(lang = nil) # rubocop:disable Metrics/AbcSize
  lowest = @array.min_by { |h| h.pubid.part.to_i }
  return @array.first&.fetch(lang) unless lowest

  all_parts_item = lowest.fetch(lang).to_all_parts
  @array.each do |other|
    next if other.pubid.part == lowest.pubid.part

    all_parts_item.relation << create_relation(other)
  end
  all_parts_item
end
42
93
 
43
- private
44
-
45
- #
46
- # Fetch document from GitHub repository
47
- #
48
- # @return [Array<RelatonIso::Hit]
49
- #
50
- def fetch_github # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
51
- ref = text.gsub(/[\s\/]/, "_").upcase
52
- url = "https://raw.githubusercontent.com/relaton/relaton-data-iso/main/data/#{ref}.yaml"
53
- resp = Net::HTTP.get_response URI(url)
54
- return [] unless resp.code == "200"
55
-
56
- hash = YAML.safe_load resp.body
57
- bib_hash = RelatonIsoBib::HashConverter.hash_to_bib hash
58
- bib_hash[:fetched] = Date.today.to_s
59
- bib = RelatonIsoBib::IsoBibliographicItem.new(**bib_hash)
60
- hit = Hit.new({ title: text }, self)
61
- hit.fetch = bib
62
- [hit]
63
- end
64
-
65
- #
66
- # Fetch hits from iso.org
67
- #
68
- # @return [Array<RelatonIso::Hit>]
69
- #
70
- def fetch_iso # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
71
- config = Algolia::Search::Config.new(application_id: "JCL49WV5AR", api_key: "dd1b9e1ab383f4d4817d29cd5e96d3f0")
72
- client = Algolia::Search::Client.new config, logger: RelatonIso.configuration.logger
73
- index = client.init_index "all_en"
74
- resp = index.search text, hitsPerPage: 100, filters: "category:standard"
75
-
76
- resp[:hits].map { |h| Hit.new h, self }.sort! do |a, b|
77
- if a.sort_weight == b.sort_weight && b.hit[:year] = a.hit[:year]
78
- a.hit[:title] <=> b.hit[:title]
79
- elsif a.sort_weight == b.sort_weight
80
- b.hit[:year] - a.hit[:year]
81
- else
82
- a.sort_weight - b.sort_weight
83
- end
84
- end
94
# Create an "instanceOf" relation pointing to a bibliographic item that
# carries the hit's identifier as formatted reference and primary docid.
#
# @param hit [RelatonIso::Hit]
# @return [RelatonBib::DocumentRelation]
def create_relation(hit)
  docid = DocumentIdentifier.new(id: hit.pubid, type: "ISO", primary: true)
  ref = RelatonBib::FormattedRef.new(content: hit.pubid.to_s)
  bibitem = RelatonIsoBib::IsoBibliographicItem.new(formattedref: ref, docid: [docid])
  RelatonBib::DocumentRelation.new(type: "instanceOf", bibitem: bibitem)
end
86
101
  end
87
102
  end
@@ -0,0 +1,132 @@
1
module RelatonIso
  # Index.
  class Index
    #
    # Initialise index. If file path is given, the index is read from that
    # file. If file is not given, `index.yml` in the user's home directory
    # is used; when it doesn't exist or is outdated the index is fetched
    # from GitHub.
    #
    # NOTE(review): earlier documentation named a `~/.relaton/iso`
    # directory, but the code reads `~/index.yml` - confirm which is meant.
    #
    # @param [String, nil] file path to index file.
    #
    def initialize(file = nil)
      @file = file
    end

    #
    # Create index.
    #
    # @return [Array<Hash>] index
    #
    def index
      @index ||= read_index || read_from_user_dir || fetch_index
    end

    #
    # Add or update index entry. The entry is looked up by the ID of the
    # item's primary identifier and gets the item's first title.
    #
    # @param [RelatonIsoBib::IsoBibliographicItem] item document
    #
    # @return [void]
    #
    def <<(item)
      id = item.docidentifier.detect(&:primary).id
      row = self[id] || begin
        r = { id: id }
        index << r
        r
      end
      row[:title] = item.title.first.title.content
    end

    #
    # Fetch document from index by ID.
    #
    # @param [String] id document ID
    #
    # @return [Hash, nil] index entry, nil when there is no such entry
    #
    def [](id)
      index.detect { |i| i[:id] == id }
    end

    #
    # Save index to file.
    #
    # @return [void]
    #
    def save
      serialize_and_save index
    end

    private

    #
    # Serialize index and save to file.
    #
    # The rows are written as one YAML sequence of mappings with string
    # keys. Writing each row as a bare mapping under a single `---` header
    # would produce one mapping with repeated keys, which can't be read
    # back as an array of rows.
    #
    # @param [Array<Hash>] idx index
    #
    # @return [void]
    #
    def serialize_and_save(idx)
      rows = idx.map { |row| row.transform_keys(&:to_s) }
      File.write @file, rows.to_yaml, encoding: "UTF-8"
    end

    #
    # Read index from file. If file doesn't exist, create empty index.
    #
    # @return [Array<Hash>, nil] index, nil when no file path was given
    #
    def read_index
      if @file && File.exist?(@file) then read_file
      elsif @file then []
      end
    end

    #
    # Read index from `index.yml` in the user's home directory. From here
    # on that file is also the one the index is saved to.
    #
    # @return [Array<Hash>, nil] index, nil if the file doesn't exist or
    #   is outdated
    #
    def read_from_user_dir
      @file = File.join(Dir.home, "index.yml")
      read_file if File.exist?(@file) && !outdated?
    end

    #
    # Parse the index file.
    #
    # @return [Array<Hash>] index rows with symbolized keys
    #
    def read_file
      yaml = File.read @file, encoding: "UTF-8"
      RelatonBib.parse_yaml yaml, [], symbolize_names: true
    end

    #
    # Check if index file is outdated.
    #
    # @return [Boolean] true if older than 24 hours
    #
    def outdated?
      (Time.now - File.mtime(@file)) / 3600 > 24
    end

    #
    # Fetch index from GitHub and save it to the index file.
    #
    # @return [Array<Hash>] index, empty when the download fails
    #
    def fetch_index
      url = "https://raw.githubusercontent.com/relaton/relaton-data-iso/master/iso/index.zip"
      # block form closes the downloaded stream once the entry is read
      yaml = URI(url).open do |io|
        Zip::InputStream.new(io).get_next_entry.get_input_stream.read
      end
      idx = RelatonBib.parse_yaml yaml, [], symbolize_names: true
      serialize_and_save idx
      idx
    rescue OpenURI::HTTPError => e
      warn "[relaton-iso] WARNING: failed to fetch index: #{e.message}"
      []
    end
  end
end