isobib 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,378 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'algoliasearch'
4
+ require 'iso_bib_item'
5
+ require 'isobib/hit'
6
+ require 'nokogiri'
7
+ require 'net/http'
8
+ require 'isobib/workers_pool'
9
+ # require 'isobib/iso_bibliographic_item'
10
+
11
+ # Capybara.request_driver :poltergeist do |app|
12
+ # Capybara::Poltergeist::Driver.new app, js_errors: false
13
+ # end
14
+ # Capybara.default_driver = :poltergeist
15
+
16
# Configure the Algolia client with the application id and API key that the
# searches in this file run against.
# NOTE(review): the credentials are hard-coded in source; presumably this is
# a public search-only key -- confirm, and consider reading them from ENV.
+ Algolia.init application_id: 'JCL49WV5AR',
17
+ api_key: 'dd1b9e1ab383f4d4817d29cd5e96d3f0'
18
+
19
+ module Isobib
20
+ # Scrapper.
21
+ # rubocop:disable Metrics/ModuleLength
22
+ module Scrapper
23
+ DOMAIN = 'https://www.iso.org'
24
+
25
# Maps the document-type abbreviation that appears in a reference
# (e.g. the "TS" of "ISO/TS 123") to the type name stored in the
# bibliographic item.
TYPES = {
  'TS' => 'technicalSpecification',
  'TR' => 'technicalReport',
  'PAS' => 'publiclyAvailableSpecification',
  'AWI' => 'approvedWorkItem',
  'CD' => 'committeeDraft',
  'FDIS' => 'finalDraftInternationalStandard',
  'NP' => 'newProposal',
  'DIS' => 'draftInternationalStandard',
  'WD' => 'workingDraft',
  'R' => 'recommendation',
  'Guide' => 'guide'
}.freeze
38
+
39
+ class << self
40
# Search for standards matching the text and scrape every hit.
# @param text [String] search query
# @return [Array<IsoBibItem::IsoBibliographicItem>] one item per hit
def get(text)
  parsers = WorkersPool.new 4
  parsers.worker { |hit| iso_worker(hit, parsers) }
  searchers = start_algolia_search(text, parsers)

  # Collect the parsed documents first: the parser pool is closed by the
  # search worker once the last result page has been queued. Only then is
  # the search pool itself shut down and joined.
  parsers.result.tap do
    searchers.end
    searchers.result
  end
end
51
+
52
# Scrape the catalogue page of a search hit into a bibliographic item.
# @param hit_data [Hash] search hit; its 'path', 'title' and 'status'
#   entries are read
# @return [IsoBibItem::IsoBibliographicItem]
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
def parse_page(hit_data)
  doc, url = get_page "/standard/#{hit_data['path'].match(/\d+$/)}.html"

  # Fetch edition: the number sits in the last child of the element that
  # wraps the "Edition" label. A page without that label gives ''.
  edition_node = doc.xpath("//strong[contains(text(), 'Edition')]/..")
                    .children.last
  edition = edition_node ? edition_node.text.match(/\d+/).to_s : ''

  titles, abstract = fetch_titles_abstract(doc)

  # Scan the language switcher once and reuse the codes for both fields.
  languages = langs(doc).map { |l| l[:lang] }

  IsoBibItem::IsoBibliographicItem.new(
    docid: fetch_docid(doc),
    edition: edition,
    language: languages,
    script: languages.map { |l| script(l) }.uniq,
    titles: titles,
    type: fetch_type(hit_data['title']),
    docstatus: fetch_status(doc, hit_data['status']),
    ics: fetch_ics(doc),
    dates: fetch_dates(doc),
    contributors: fetch_contributors(hit_data['title']),
    workgroup: fetch_workgroup(doc),
    abstract: abstract,
    copyright: fetch_copyright(hit_data['title'], doc),
    source: fetch_source(doc, url),
    relations: fetch_relations(doc)
  )
end
83
+ # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
84
+
85
+ private
86
+
87
# Start the pool that pages through the Algolia search results.
# @param text [String] search query
# @param iso_workers [WorkersPool] pool that receives every hit
# @return [WorkersPool] the search pool, with page 0 already queued
def start_algolia_search(text, iso_workers)
  search_pool = WorkersPool.new
  all_en = Algolia::Index.new 'all_en'
  search_pool.worker do |page_number|
    algolia_worker(all_en, text, page_number, search_pool, iso_workers)
  end

  # Queueing the first page sets the workers going.
  search_pool << 0
end
101
+
102
# Report progress on standard output, then parse the page of one hit.
# @param hit [Hash] search hit
# @param iso_workers [WorkersPool] pool whose counters feed the progress line
# @return [IsoBibItem::IsoBibliographicItem]
def iso_worker(hit, iso_workers)
  done = iso_workers.size
  total = iso_workers.nb_hits
  # The trailing carriage return keeps the progress on a single line.
  print "Parse #{done} of #{total} \r"
  parse_page(hit)
end
109
+
110
# Fetch one page of hits from the Algolia search service, queue the hits
# for parsing and schedule the following page, if there is one.
# @param index [Algolia::Index]
# @param text [String] search query
# @param page [Integer] number of the result page to fetch
# @param algolia_workers [WorkersPool] pool that receives the next page number
# @param iso_workers [WorkersPool] pool that receives the hits; it is
#   closed after the last page
def algolia_worker(index, text, page, algolia_workers, iso_workers)
  res = index.search text, facetFilters: ['category:standard'], page: page
  next_page = res['page'] + 1
  more_pages = next_page < res['nbPages']
  algolia_workers << next_page if more_pages

  # The total is a property of the response, not of a hit: set it once.
  iso_workers.nb_hits = res['nbHits']
  res['hits'].each { |hit| iso_workers << hit }

  iso_workers.end unless more_pages
end
126
+
127
# Collect the title and the abstract of the document in every language
# listed by the page.
# @param doc [Nokogiri::HTML::Document] English page of the document
# @return [Array<Array>] two arrays: the titles, then the abstracts
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
def fetch_titles_abstract(doc)
  titles = []
  abstracts = []
  langs(doc).each do |entry|
    code = entry[:lang]
    # Only non-English entries carry a path; the English page is the one
    # already at hand.
    page = entry[:path] ? get_page(entry[:path]).first : doc

    # Skip the language when the page flags it as unavailable.
    next if page.css('h5.help-block').any?
    titles << fetch_title(page, code)

    text = page.css("div[itemprop='description'] p").text
    next if text.empty?
    abstracts << { content: text, language: code, script: script(code) }
  end
  [titles, abstracts]
end
153
+ # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
154
+
155
# List the languages the document is offered in.
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] always starts with { lang: 'en' }; each further
#   entry also has the :path of the translated page
def langs(doc)
  translations = doc.css('ul#lang-switcher ul li a').map do |link|
    href = link.attr('href')
    found = href.match(%r{^\/(fr)\/})
    { lang: found[1], path: href } if found
  end
  [{ lang: 'en' }] + translations.compact
end
167
+
168
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
# Download a page of the ISO site and parse it.
# A single 301 redirect is followed.
# @param path [String] page's path, appended to DOMAIN
# @return [Array<Nokogiri::HTML::Document, String>] the parsed page and
#   the URL it was finally loaded from
def get_page(path)
  url = DOMAIN + path
  uri = URI url
  resp = Net::HTTP.get_response uri
  if resp.code == '301'
    # The Location header may hold an absolute URL as well as a path, so
    # resolve it against the requested URL instead of prefixing DOMAIN.
    uri = URI.join(url, resp['location'])
    url = uri.to_s
    resp = Net::HTTP.get_response uri
  end

  # Ask again, at most 10 times, while the body has no "<strong" in it.
  # NOTE(review): presumably the site sometimes answers with an incomplete
  # page -- confirm.
  attempts = 0
  while resp.body !~ /<strong/ && attempts < 10
    resp = Net::HTTP.get_response uri
    attempts += 1
  end
  [Nokogiri::HTML(resp.body), url]
end
189
+ # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
190
+
191
# Read the project and part numbers from the item reference on the page.
# @param doc [Nokogiri::HTML::Document]
# @return [Hash] :project_number and :part_number; the part number is an
#   empty string when the reference has no part
def fetch_docid(doc)
  reference = doc.xpath("//strong[@id='itemReference']").text
  project, part = reference.match(/(?<=\s)(\d+)-?((?<=-)\d+|)/).captures
  { project_number: project, part_number: part }
end
199
+
200
# Build the document status from the active stage code on the page.
# @param doc [Nokogiri::HTML::Document]
# @param status [String] status reported by the search hit
# @return [Hash] :status, :stage and :substage
def fetch_status(doc, status)
  code = doc.css('li.dropdown.active span.stage-code > strong').text
  # The code reads "<stage>.<substage>".
  parts = code.split('.')
  { status: status, stage: parts[0], substage: parts[1] }
end
209
+
210
# Describe the technical committee responsible for the document.
# @param doc [Nokogiri::HTML::Document]
# @return [Hash] the ISO organization data with a nested
#   :technical_committee hash
def fetch_workgroup(doc)
  link_text = doc.css('div.entry-name.entry-block a')[0].text
  # The committee number is the first run of digits after the first "/".
  committee_number = link_text.split('/')[1].match(/\d+/).to_s.to_i
  committee_title = doc.css('div.entry-title')[0].text

  {
    name: 'International Organization for Standardization',
    abbreviation: 'ISO',
    url: 'www.iso.org',
    technical_committee: {
      name: link_text + committee_title,
      type: 'technicalCommittee',
      number: committee_number
    }
  }
end
226
+
227
# Collect the documents linked from the life-cycle steps of the page.
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] one hash per link with :type, :identifier and :url
# rubocop:disable Metrics/MethodLength
def fetch_relations(doc)
  relation_types = {
    'Previously' => 'obsoletes',
    'Will be replaced by' => 'obsoletes',
    'Corrigenda/Amendments' => 'updates',
    'Revised by' => 'updates',
    'Now confirmed' => 'updates'
  }
  doc.css('ul.steps li').flat_map do |step|
    label = step.css('strong').text
    # Labels without a mapping are used as the relation type unchanged.
    type = relation_types.fetch(label, label)
    # Steps describing the document itself contribute no relation.
    next [] if ['Now', 'Now under review'].include? type

    step.css('a').map do |link|
      { type: type, identifier: link.text, url: link['href'] }
    end
  end
end
249
+ # rubocop:enable Metrics/MethodLength
250
+
251
# Work out the document type from the reference at the start of a title.
# @param title [String] e.g. "ISO/TS 123:2015"
# @return [String, nil] a value of TYPES when the reference names a type,
#   otherwise a type derived from the publisher (ISO or IWA); nil when
#   the title is not recognised or the publisher has no default type
def fetch_type(title)
  type_match = title.match(%r{^(ISO|IWA|IEC)(?:\/IEC\s|\/IEEE\s|\/PRF\s|
    \/NP\s|\s|\/)(TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))}x)
  # An unknown reference format has no type; do not fail on the nil match.
  return nil unless type_match

  publisher, abbreviation = type_match.captures
  return TYPES[abbreviation] if TYPES[abbreviation]

  case publisher
  when 'ISO' then 'international-standard'
  when 'IWA' then 'international-workshop-agreement'
  end
end
267
+
268
# Split the page's title into its intro, main and part components.
# @param doc [Nokogiri::HTML::Document]
# @param lang [String] language code of the page
# @return [Hash] title components (nil when absent), language and script
def fetch_title(doc, lang)
  # The components are separated by " -- ".
  parts = doc.css("h3[itemprop='description']").text.split(' -- ')
  {
    title_intro: parts[0],
    title_main: parts[1],
    title_part: parts[2],
    language: lang,
    script: script(lang)
  }
end
283
+
284
# Return the ISO script code for a language.
# @param lang [String] language code
# @return [String, nil] 'Latn' for 'en' and 'fr', nil for anything else
def script(lang)
  'Latn' if %w[en fr].include?(lang)
end
292
+
293
# Fetch the dates of the document; only the release date is read.
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] a single 'published' entry, or an empty array
#   when the page has no release date
def fetch_dates(doc)
  published = doc.xpath("//span[@itemprop='releaseDate']").text
  return [] if published.empty?

  [{ type: 'published', from: published }]
end
304
+
305
# rubocop:disable Metrics/MethodLength
# List a publisher for every "/"-separated abbreviation in the first word
# of the title.
# @param title [String] e.g. "ISO/IEC 27001:2013"
# @return [Array<Hash>] one entry per abbreviation; name and url are nil
#   for abbreviations other than ISO and IEC
def fetch_contributors(title)
  organizations = {
    'ISO' => ['International Organization for Standardization',
              'www.iso.org'],
    'IEC' => ['International Electrotechnical Commission', 'www.iec.ch']
  }
  title.sub(/\s.*/, '').split('/').map do |abbrev|
    name, url = organizations[abbrev]
    { entity: { name: name, url: url, abbreviation: abbrev },
      roles: ['publisher'] }
  end
end
320
+ # rubocop:enable Metrics/MethodLength
321
+
322
# Fetch the ICS codes the document is classified under.
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] one hash per code with :field, :group and
#   :subgroup (nil for the levels a code does not have)
def fetch_ics(doc)
  query = "//strong[contains(text(), 'ICS')]/.." \
          '/following-sibling::dd/div/a'
  doc.xpath(query).map do |link|
    # The code is the first run of digits and dots in the link text.
    field, group, subgroup = link.text[/[\d\.]+/].to_s.split('.')
    { field: field, group: group, subgroup: subgroup }
  end
end
332
+
333
# Fetch the source links of the document.
# @param doc [Nokogiri::HTML::Document]
# @param url [String] URL the page was loaded from
# @return [Array<Hash>] 'src', 'obp' and 'rss' entries; the content of
#   'obp' or 'rss' is nil when the page has no such link
def fetch_source(doc, url)
  obp_links = doc.xpath("//a[contains(@href, '/obp/ui/')]")
  obp = obp_links.attr('href').value if obp_links.any?

  # Guard the feed link the same way as the OBP link: a page without one
  # must not abort the whole scrape.
  rss_links = doc.xpath("//a[contains(@href, 'rss')]")
  rss = DOMAIN + rss_links.attr('href').value if rss_links.any?

  [
    { type: 'src', content: url },
    { type: 'obp', content: obp },
    { type: 'rss', content: rss }
  ]
end
348
+
349
# Fetch the copyright owner and year.
# @param title [String] document reference, e.g. "ISO 19115-1:2014"
# @param doc [Nokogiri::HTML::Document] consulted for the release date
#   when the title carries no year
# @return [Hash] owner name (the text before the first whitespace of the
#   title) and the year as :from
def fetch_copyright(title, doc)
  owner_name = title[/.*?(?=\s)/].to_s
  # Prefer the four digits after the colon in the reference.
  year = title[/(?<=:)\d{4}/].to_s
  if year.empty?
    release_date = doc.xpath("//span[@itemprop='releaseDate']").text
    year = release_date[/\d{4}/].to_s
  end
  { owner: { name: owner_name }, from: year }
end
361
+ end
362
+
363
+ # private
364
+ #
365
+ # def next_hits_page(next_page)
366
+ # page = @index.search @text, facetFilters: ['category:standard'],
367
+ # page: next_page
368
+ # page.each do |key, value|
369
+ # if key == 'hits'
370
+ # @docs[key] += value
371
+ # else
372
+ # @docs[key] = value
373
+ # end
374
+ # end
375
+ # end
376
+ end
377
+ # rubocop:enable Metrics/ModuleLength
378
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Namespace of the isobib gem.
module Isobib
  # Version of the gem.
  VERSION = '0.1.2'
end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
# Workers pool: a fixed number of threads that take items from a bounded
# queue, run a block on each of them and collect the block's results.
class WorkersPool
  # Counter kept for the pool's users; the pool itself never reads it.
  attr_accessor :nb_hits

  # @param num_workers [Integer] number of threads; never fewer than 2
  def initialize(num_workers = 2)
    @num_workers = [num_workers, 2].max
    # Size the queue from the effective worker count, so that a request
    # for 0 workers does not ask for an invalid zero-length queue.
    @queue = SizedQueue.new(@num_workers * 2)
    @threads = []
    @result = []
    @nb_hits = 0
  end

  # Start the worker threads.
  # @yieldparam item [Object] item taken from the queue
  # @yieldreturn [Object] value appended to the results
  # @return [Array<Thread>]
  def worker(&block)
    @threads = Array.new @num_workers do
      Thread.new do
        # :END is the stop marker; each thread consumes exactly one.
        until (item = @queue.pop) == :END
          @result << block.call(item) if block
        end
      end
    end
  end

  # Wait for every worker thread to finish and return what they collected.
  # Returns an empty array when no worker has been started.
  # @return [Array]
  def result
    @threads.each(&:join)
    @result
  end

  # Queue an item for processing; waits while the queue is full.
  # @param item [Object]
  # @return [WorkersPool] self, so that calls can be chained
  def <<(item)
    @queue << item
    self
  end

  # Queue one stop marker per worker thread.
  def end
    @num_workers.times { @queue << :END }
  end

  # @return [Integer] number of results collected so far
  def size
    @result.size
  end
end
data/lib/isobib.rb ADDED
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'isobib/version'
4
+ require 'isobib/iso_bibliography'
metadata ADDED
@@ -0,0 +1,174 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: isobib
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.2
5
+ platform: ruby
6
+ authors:
7
+ - Ribose Inc.
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2018-05-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.16'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.16'
27
+ - !ruby/object:Gem::Dependency
28
+ name: pry-byebug
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '10.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: simplecov
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: algoliasearch
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: iso-bib-item
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ description: 'IsoBib: retrieve ISO Standards for bibliographic use using the BibliographicItem
112
+ model'
113
+ email:
114
+ - open.source@ribose.com
115
+ executables: []
116
+ extensions: []
117
+ extra_rdoc_files: []
118
+ files:
119
+ - ".gitignore"
120
+ - ".rspec"
121
+ - ".rubocop.yml"
122
+ - ".travis.yml"
123
+ - CODE_OF_CONDUCT.md
124
+ - Gemfile
125
+ - Gemfile.lock
126
+ - README.adoc
127
+ - Rakefile
128
+ - bin/bundle
129
+ - bin/byebug
130
+ - bin/coderay
131
+ - bin/console
132
+ - bin/htmldiff
133
+ - bin/httpclient
134
+ - bin/ldiff
135
+ - bin/nokogiri
136
+ - bin/pry
137
+ - bin/rake
138
+ - bin/rspec
139
+ - bin/setup
140
+ - isobib.gemspec
141
+ - lib/isobib.rb
142
+ - lib/isobib/hit.rb
143
+ - lib/isobib/hit_collection.rb
144
+ - lib/isobib/hit_pages.rb
145
+ - lib/isobib/iso_bibliography.rb
146
+ - lib/isobib/scrapper.rb
147
+ - lib/isobib/version.rb
148
+ - lib/isobib/workers_pool.rb
149
+ homepage: https://github.com/riboseinc/isobib
150
+ licenses:
151
+ - MIT
152
+ metadata: {}
153
+ post_install_message:
154
+ rdoc_options: []
155
+ require_paths:
156
+ - lib
157
+ required_ruby_version: !ruby/object:Gem::Requirement
158
+ requirements:
159
+ - - ">="
160
+ - !ruby/object:Gem::Version
161
+ version: '0'
162
+ required_rubygems_version: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: '0'
167
+ requirements: []
168
+ rubyforge_project:
169
+ rubygems_version: 2.6.12
170
+ signing_key:
171
+ specification_version: 4
172
+ summary: 'IsoBib: retrieve ISO Standards for bibliographic use using the BibliographicItem
173
+ model'
174
+ test_files: []