isobib 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,378 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'algoliasearch'
4
+ require 'iso_bib_item'
5
+ require 'isobib/hit'
6
+ require 'nokogiri'
7
+ require 'net/http'
8
+ require 'isobib/workers_pool'
9
+ # require 'isobib/iso_bibliographic_item'
10
+
11
+ # Capybara.request_driver :poltergeist do |app|
12
+ # Capybara::Poltergeist::Driver.new app, js_errors: false
13
+ # end
14
+ # Capybara.default_driver = :poltergeist
15
+
16
# Configure the Algolia client used to query the ISO catalogue search index.
Algolia.init(
  application_id: 'JCL49WV5AR',
  api_key: 'dd1b9e1ab383f4d4817d29cd5e96d3f0'
)
18
+
19
+ module Isobib
20
+ # Scrapper.
21
+ # rubocop:disable Metrics/ModuleLength
22
+ module Scrapper
23
+ DOMAIN = 'https://www.iso.org'
24
+
25
+ TYPES = {
26
+ 'TS' => 'technicalSpecification',
27
+ 'TR' => 'technicalReport',
28
+ 'PAS' => 'publiclyAvailableSpecification',
29
+ 'AWI' => 'appruvedWorkItem',
30
+ 'CD' => 'committeeDraft',
31
+ 'FDIS' => 'finalDraftInternationalStandard',
32
+ 'NP' => 'newProposal',
33
+ 'DIS' => 'draftInternationalStandard',
34
+ 'WD' => 'workingDraft',
35
+ 'R' => 'recommendation',
36
+ 'Guide' => 'guide'
37
+ }.freeze
38
+
39
+ class << self
40
# Search the ISO catalogue for +text+ and scrape every matching standard.
# Search hits are pushed into a pool of page-scraping workers; the method
# blocks until both pools have finished.
# @param text [String] search query
# @return [Array] one entry per hit, as produced by #parse_page
def get(text)
  page_pool = WorkersPool.new(4)
  page_pool.worker { |hit| iso_worker(hit, page_pool) }
  search_pool = start_algolia_search(text, page_pool)

  # The page pool is ended by the Algolia worker that handles the last
  # result page, so it can be joined before the search pool is ended.
  documents = page_pool.result
  search_pool.end
  search_pool.result
  documents
end
51
+
52
# Scrape the catalogue page of a single search hit into a bibliographic item.
# @param hit_data [Hash] search hit; the 'path', 'title' and 'status' keys
#   are read
# @return [IsoBibItem::IsoBibliographicItem]
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
def parse_page(hit_data)
  hit_title = hit_data['title']
  doc, url = get_page "/standard/#{hit_data['path'].match(/\d+$/)}.html"

  # The edition number is taken from the last child of the element that
  # wraps the "Edition" label.
  edition_holder = doc.xpath("//strong[contains(text(), 'Edition')]/..")
  edition = edition_holder.children.last.text.match(/\d+/).to_s

  titles, abstract = fetch_titles_abstract(doc)
  languages = langs(doc).map { |entry| entry[:lang] }

  IsoBibItem::IsoBibliographicItem.new(
    docid: fetch_docid(doc),
    edition: edition,
    language: languages,
    script: languages.map { |code| script(code) }.uniq,
    titles: titles,
    type: fetch_type(hit_title),
    docstatus: fetch_status(doc, hit_data['status']),
    ics: fetch_ics(doc),
    dates: fetch_dates(doc),
    contributors: fetch_contributors(hit_title),
    workgroup: fetch_workgroup(doc),
    abstract: abstract,
    copyright: fetch_copyright(hit_title, doc),
    source: fetch_source(doc, url),
    relations: fetch_relations(doc)
  )
end
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
84
+
85
+ private
86
+
87
# Create and start the pool that pages through the Algolia search results.
# @param text [String] search query
# @param iso_workers [WorkersPool] pool that receives every hit
# @return [WorkersPool] the Algolia pool, with page 0 already queued
def start_algolia_search(text, iso_workers)
  search_index = Algolia::Index.new('all_en')
  pool = WorkersPool.new
  pool.worker do |page|
    algolia_worker(search_index, text, page, pool, iso_workers)
  end

  # Queue the first page so the workers have something to start with;
  # `<<` hands the pool back, which is the return value.
  pool << 0
end
101
+
102
# Scrape one hit after printing a progress line to standard output.
# @param hit [Hash] search hit to scrape
# @param iso_workers [WorkersPool] pool whose counters feed the progress line
# @return the value returned by #parse_page
def iso_worker(hit, iso_workers)
  done = iso_workers.size
  total = iso_workers.nb_hits
  print "Parse #{done} of #{total} \r"
  parse_page(hit)
end
109
+
110
# Fetch one page of search results and hand its hits to the scraping pool.
# Queues the following page when there is one; otherwise signals the
# scraping pool that no more hits will arrive.
# @param index [Algolia::Index]
# @param text [String] search query
# @param page [Integer] zero-based result page to fetch
# @param algolia_workers [WorkersPool] pool that receives the next page number
# @param iso_workers [WorkersPool] pool that receives the hits
def algolia_worker(index, text, page, algolia_workers, iso_workers)
  response = index.search(text, facetFilters: ['category:standard'], page: page)
  following = response['page'] + 1
  more_pages = following < response['nbPages']
  algolia_workers << following if more_pages

  response['hits'].each do |hit|
    iso_workers.nb_hits = response['nbHits']
    iso_workers << hit
  end

  iso_workers.end unless more_pages
end
126
+
127
# Collect the title and the abstract for every language the page offers.
# @param doc [Nokogiri::HTML::Document] English page of the standard
# @return [Array<Array>] pair of [titles, abstracts], each a list of hashes
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
def fetch_titles_abstract(doc)
  titles = []
  abstracts = []
  langs(doc).each do |entry|
    code = entry[:lang]
    # The English entry has no path: the page is already loaded.
    page = entry[:path] ? get_page(entry[:path]).first : doc

    # A help block marks a page that is unavailable in this language.
    next if page.css('h5.help-block').any?

    titles << fetch_title(page, code)

    summary = page.css("div[itemprop='description'] p").text
    next if summary.empty?

    abstracts << { content: summary, language: code, script: script(code) }
  end
  [titles, abstracts]
end
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
154
+
155
# List the languages the page is available in.
# English is always first and carries no path; French is added when the
# language switcher links to a path starting with "/fr/".
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] hashes with :lang and, except for English, :path
def langs(doc)
  switcher_links = doc.css('ul#lang-switcher ul li a')
  switcher_links.each_with_object([{ lang: 'en' }]) do |link, found|
    href = link.attr('href')
    matched = href.match(%r{^\/(fr)\/})
    found << { lang: matched[1], path: href } if matched
  end
end
167
+
168
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
# Download and parse a page of the ISO site.
# A single 301 redirect is followed. The request is then repeated, up to
# 10 more times, while the response body contains no "<strong" markup.
# @param path [String] page's path, appended to DOMAIN
# @return [Array<Nokogiri::HTML::Document, String>] the parsed page and the
#   URL it was finally requested from
def get_page(path)
  url = DOMAIN + path
  uri = URI url
  resp = Net::HTTP.get_response uri
  if resp.code == '301'
    # The Location header may be relative or absolute; resolve it against
    # the requested URL instead of blindly prefixing DOMAIN.
    uri = URI.join(url, resp['location'])
    url = uri.to_s
    resp = Net::HTTP.get_response uri
  end
  attempts = 0
  while resp.body !~ /<strong/ && attempts < 10
    resp = Net::HTTP.get_response uri
    attempts += 1
  end
  [Nokogiri::HTML(resp.body), url]
end
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
190
+
191
# Extract the project and part numbers from the page's item reference.
# Raises NoMethodError when the reference holds no number after whitespace.
# @param doc [Nokogiri::HTML::Document]
# @return [Hash] :project_number and :part_number (empty string when the
#   reference has no part)
def fetch_docid(doc)
  reference = doc.xpath("//strong[@id='itemReference']").text
  project, part = reference.match(/(?<=\s)(\d+)-?((?<=-)\d+|)/).captures
  { project_number: project, part_number: part }
end
199
+
200
# Build the document status from the active stage code shown on the page.
# @param doc [Nokogiri::HTML::Document]
# @param status [String] status label, passed through unchanged
# @return [Hash] :status, :stage and :substage (the two parts of the
#   dot-separated stage code; nil when absent)
def fetch_status(doc, status)
  stage_code = doc.css('li.dropdown.active span.stage-code > strong').text
  parts = stage_code.split('.')
  { status: status, stage: parts[0], substage: parts[1] }
end
209
+
210
# Describe the committee responsible for the standard.
# The committee number is the first run of digits in the second
# "/"-separated segment of the committee link text.
# @param doc [Nokogiri::HTML::Document]
# @return [Hash] ISO organisation details with a nested :technical_committee
def fetch_workgroup(doc)
  committee_link = doc.css('div.entry-name.entry-block a')[0]
  committee_ref = committee_link.text
  committee_title = doc.css('div.entry-title')[0].text
  committee_number = committee_ref.split('/')[1].match(/\d+/).to_s.to_i

  {
    name: 'International Organization for Standardization',
    abbreviation: 'ISO',
    url: 'www.iso.org',
    technical_committee: {
      name: committee_ref + committee_title,
      type: 'technicalCommittee',
      number: committee_number
    }
  }
end
226
+
227
# Collect the related documents listed in the page's life-cycle steps.
# Step labels are mapped to relation types; steps labelled "Now" or
# "Now under review" contribute nothing, and any other unmapped label is
# used as the type as is.
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] hashes with :type, :identifier and :url
# rubocop:disable Metrics/MethodLength
def fetch_relations(doc)
  relation_types = {
    'Previously' => 'obsoletes',
    'Will be replaced by' => 'obsoletes',
    'Corrigenda/Amendments' => 'updates',
    'Revised by' => 'updates',
    'Now confirmed' => 'updates'
  }
  ignored = ['Now', 'Now under review']

  doc.css('ul.steps li').flat_map do |step|
    label = step.css('strong').text
    type = relation_types.fetch(label, label)
    next [] if ignored.include?(type)

    step.css('a').map do |link|
      { type: type, identifier: link.text, url: link['href'] }
    end
  end
end
# rubocop:enable Metrics/MethodLength
250
+
251
# Derive the document type from the reference at the start of the title.
# A deliverable code found in TYPES wins; otherwise the publisher prefix
# decides. Titles that do not start with a recognised reference raise
# NoMethodError.
# @param title [String] e.g. "ISO/TS 19115-3:2016"
# @return [String, nil] nil for an IEC reference without a known code
def fetch_type(title)
  reference = title.match(%r{^(ISO|IWA|IEC)(?:\/IEC\s|\/IEEE\s|\/PRF\s|
    \/NP\s|\s|\/)(TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))}x)
  publisher = reference[1]
  deliverable = TYPES[reference[2]]
  return deliverable if deliverable

  case publisher
  when 'ISO' then 'international-standard'
  when 'IWA' then 'international-workshop-agreement'
  end
end
267
+
268
# Split the page heading into its " -- "-separated title components.
# @param doc [Nokogiri::HTML::Document]
# @param lang [String] language code of the page
# @return [Hash] :title_intro, :title_main, :title_part (nil when missing),
#   :language and :script
def fetch_title(doc, lang)
  heading = doc.css("h3[itemprop='description']").text
  parts = heading.split(' -- ')
  {
    title_intro: parts[0],
    title_main: parts[1],
    title_part: parts[2],
    language: lang,
    script: script(lang)
  }
end
283
+
284
# Map a language code to its script code.
# @param lang [String] language code
# @return [String, nil] 'Latn' for 'en' and 'fr', nil for anything else
def script(lang)
  return 'Latn' if %w[en fr].include?(lang)

  nil
end
292
+
293
# Read the publication date from the page.
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] a single 'published' entry, or an empty array when
#   the page shows no release date
def fetch_dates(doc)
  released = doc.xpath("//span[@itemprop='releaseDate']").text
  return [] if released.empty?

  [{ type: 'published', from: released }]
end
304
+
305
# rubocop:disable Metrics/MethodLength
# List the publishing organisations named before the first whitespace of
# the title, e.g. "ISO/IEC 27001:2013" yields ISO and IEC.
# @param title [String]
# @return [Array<Hash>] one publisher entry per abbreviation; name and url
#   are nil for abbreviations other than ISO and IEC
def fetch_contributors(title)
  known_publishers = {
    'ISO' => ['International Organization for Standardization', 'www.iso.org'],
    'IEC' => ['International Electrotechnical Commission', 'www.iec.ch']
  }
  abbreviations = title.sub(/\s.*/, '').split('/')
  abbreviations.map do |abbrev|
    name, url = known_publishers[abbrev]
    { entity: { name: name, url: url, abbreviation: abbrev },
      roles: ['publisher'] }
  end
end
# rubocop:enable Metrics/MethodLength
321
+
322
# Read the ICS classification codes linked from the page.
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] :field, :group and :subgroup taken from the
#   dot-separated code (missing levels are nil)
def fetch_ics(doc)
  ics_links = doc.xpath(
    "//strong[contains(text(), 'ICS')]/../following-sibling::dd/div/a"
  )
  ics_links.map do |link|
    field, group, subgroup = link.text.match(/[\d\.]+/).to_s.split('.')
    { field: field, group: group, subgroup: subgroup }
  end
end
332
+
333
# List the source links of the document: the page itself, the Online
# Browsing Platform link and the RSS feed.
# @param doc [Nokogiri::HTML::Document]
# @param url [String] URL the page was fetched from
# @return [Array<Hash>] three entries (:type, :content); the 'obp' and 'rss'
#   contents are nil when the page has no matching link
def fetch_source(doc, url)
  obp_links = doc.xpath("//a[contains(@href, '/obp/ui/')]")
  obp = obp_links.attr('href').value if obp_links.any?

  # Guarded like the OBP link: a page without an rss link must not abort
  # the whole scrape with a NoMethodError.
  rss_links = doc.xpath("//a[contains(@href, 'rss')]")
  rss = DOMAIN + rss_links.attr('href').value if rss_links.any?

  [
    { type: 'src', content: url },
    { type: 'obp', content: obp },
    { type: 'rss', content: rss }
  ]
end
348
+
349
# Work out the copyright owner and year.
# The owner is the part of the title before the first whitespace. The year
# is the four digits following a colon in the title, falling back to the
# first four digits of the page's release date.
# @param title [String]
# @param doc [Nokogiri::HTML::Document]
# @return [Hash] { owner: { name: String }, from: String }
def fetch_copyright(title, doc)
  owner = title[/.*?(?=\s)/].to_s
  year = title[/(?<=:)\d{4}/].to_s
  if year.empty?
    released = doc.xpath("//span[@itemprop='releaseDate']").text
    year = released[/\d{4}/].to_s
  end
  { owner: { name: owner }, from: year }
end
361
+ end
362
+
363
+ # private
364
+ #
365
+ # def next_hits_page(next_page)
366
+ # page = @index.search @text, facetFilters: ['category:standard'],
367
+ # page: next_page
368
+ # page.each do |key, value|
369
+ # if key == 'hits'
370
+ # @docs[key] += value
371
+ # else
372
+ # @docs[key] = value
373
+ # end
374
+ # end
375
+ # end
376
+ end
377
+ # rubocop:enable Metrics/ModuleLength
378
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Top-level namespace of the isobib gem.
module Isobib
  # Version of the gem.
  VERSION = '0.1.2'
end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Workers poll.
4
+ class WorkersPool
5
+ attr_accessor :nb_hits
6
+
7
+ def initialize(num_workers = 2)
8
+ @num_workers = num_workers < 2 ? 2 : num_workers
9
+ @queue = SizedQueue.new(num_workers * 2)
10
+ @result = []
11
+ @nb_hits = 0
12
+ end
13
+
14
+ def worker(&block)
15
+ @threads = Array.new @num_workers do
16
+ Thread.new do
17
+ until (item = @queue.pop) == :END
18
+ @result << yield(item) if block
19
+ end
20
+ end
21
+ end
22
+ end
23
+
24
+ def result
25
+ @threads.each(&:join)
26
+ @result
27
+ end
28
+
29
+ def <<(item)
30
+ @queue << item
31
+ self
32
+ end
33
+
34
+ def end
35
+ @num_workers.times { @queue << :END }
36
+ end
37
+
38
+ def size
39
+ @result.size
40
+ end
41
+ end
data/lib/isobib.rb ADDED
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'isobib/version'
4
+ require 'isobib/iso_bibliography'
metadata ADDED
@@ -0,0 +1,174 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: isobib
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.2
5
+ platform: ruby
6
+ authors:
7
+ - Ribose Inc.
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2018-05-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.16'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.16'
27
+ - !ruby/object:Gem::Dependency
28
+ name: pry-byebug
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '10.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: simplecov
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: algoliasearch
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: iso-bib-item
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ description: 'IsoBib: retrieve ISO Standards for bibliographic use using the BibliographicItem
112
+ model'
113
+ email:
114
+ - open.source@ribose.com
115
+ executables: []
116
+ extensions: []
117
+ extra_rdoc_files: []
118
+ files:
119
+ - ".gitignore"
120
+ - ".rspec"
121
+ - ".rubocop.yml"
122
+ - ".travis.yml"
123
+ - CODE_OF_CONDUCT.md
124
+ - Gemfile
125
+ - Gemfile.lock
126
+ - README.adoc
127
+ - Rakefile
128
+ - bin/bundle
129
+ - bin/byebug
130
+ - bin/coderay
131
+ - bin/console
132
+ - bin/htmldiff
133
+ - bin/httpclient
134
+ - bin/ldiff
135
+ - bin/nokogiri
136
+ - bin/pry
137
+ - bin/rake
138
+ - bin/rspec
139
+ - bin/setup
140
+ - isobib.gemspec
141
+ - lib/isobib.rb
142
+ - lib/isobib/hit.rb
143
+ - lib/isobib/hit_collection.rb
144
+ - lib/isobib/hit_pages.rb
145
+ - lib/isobib/iso_bibliography.rb
146
+ - lib/isobib/scrapper.rb
147
+ - lib/isobib/version.rb
148
+ - lib/isobib/workers_pool.rb
149
+ homepage: https://github.com/riboseinc/isobib
150
+ licenses:
151
+ - MIT
152
+ metadata: {}
153
+ post_install_message:
154
+ rdoc_options: []
155
+ require_paths:
156
+ - lib
157
+ required_ruby_version: !ruby/object:Gem::Requirement
158
+ requirements:
159
+ - - ">="
160
+ - !ruby/object:Gem::Version
161
+ version: '0'
162
+ required_rubygems_version: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: '0'
167
+ requirements: []
168
+ rubyforge_project:
169
+ rubygems_version: 2.6.12
170
+ signing_key:
171
+ specification_version: 4
172
+ summary: 'IsoBib: retrieve ISO Standards for bibliographic use using the BibliographicItem
173
+ model'
174
+ test_files: []