gbbib 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/Gemfile.lock +1 -1
- data/lib/gbbib/gb_bibliography.rb +121 -1
- data/lib/gbbib/hit.rb +2 -0
- data/lib/gbbib/hit_collection.rb +19 -19
- data/lib/gbbib/scrapper.rb +21 -14
- data/lib/gbbib/version.rb +1 -1
- data/lib/gbbib/workers_pool.rb +41 -0
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: eb3b926e47fdfca5623a37e092d8072be696a8fe
|
4
|
+
data.tar.gz: 781ce62f34f01ea01ce1b1bea083ec851beb56e8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c19520eb951344e11564dfba80bc16111ef331dc5b46e8bb37a31b1d3892ad56165ed90495eb82a1158e1034f6e05800dc7d40feced12c74f2e067a50e7ce173
|
7
|
+
data.tar.gz: 4b75cfa4a98a855a83800d7c84c207997f008a5d5ab80a655fa5d2098be19ae248dd255afce983b069a7d27d997aa1ac411cd26d26f5c7c31a327f11069f5776
|
data/Gemfile.lock
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require 'gbbib/workers_pool'
|
4
|
+
|
3
5
|
# GB bib module.
|
4
6
|
module Gbbib
|
5
7
|
# GB entry point class.
|
@@ -7,7 +9,7 @@ module Gbbib
|
|
7
9
|
class << self
|
8
10
|
# rubocop:disable Metrics/MethodLength
|
9
11
|
# @param text [String] code of standard for search
|
10
|
-
# @return [Gbbib::
|
12
|
+
# @return [Gbbib::HitCollection]
|
11
13
|
def search(text)
|
12
14
|
if text.match?(/^(GB|GJ|GS)/)
|
13
15
|
# Scrape national standards.
|
@@ -30,6 +32,124 @@ module Gbbib
|
|
30
32
|
end
|
31
33
|
end
|
32
34
|
# rubocop:enable Metrics/MethodLength
|
35
|
+
|
36
|
+
# @param code [String] the GB standard code to look up (e.g. "GB/T 20223")
|
37
|
+
# @param year [String] the year the standard was published (optional)
|
38
|
+
# @param opts [Hash] options; restricted to :all_parts if all-parts reference is required
|
39
|
+
# @return [String] Relaton XML serialisation of reference
|
40
|
+
# Looks up one reference and serialises it.
# `year` and `opts` are optional: omitting them (or passing nil) means
# "no year restriction" and "no options". Three-argument calls are unchanged.
def get(code, year = nil, opts = {})
  opts ||= {}
  # IEV is answered from the canned record built by #iev; no search is run.
  return iev.to_xml if code.casecmp? 'IEV'

  # An all-parts request is searched under part 1 of the code.
  code += '.1' if opts[:all_parts]
  ret = get1(code, year, opts)
  return nil if ret.nil?

  # NOTE(review): the return values of the next two calls are ignored, so
  # they presumably modify `ret` in place -- confirm against the item class.
  ret.to_most_recent_reference unless year
  ret.to_all_parts if opts[:all_parts]
  ret.to_xml
end
|
49
|
+
|
50
|
+
private
|
51
|
+
|
52
|
+
# Prints warnings explaining that no reference was found, then returns nil
# so the caller can pass the result straight through.
# NOTE(review): the messages name the ISO website and ISO document types
# although this module looks up GB standards -- confirm the wording.
# @param code [String] the code that was searched for
# @param year [String, nil] the requested year, if any
# @param missed_years [Array] years that were found but did not match
# @return [nil]
def fetch_ref_err(code, year, missed_years)
  id = year ? "#{code}:#{year}" : code
  warn "WARNING: no match found on the ISO website for #{id}. "\
    "The code must be exactly like it is on the website."
  unless missed_years.empty?
    warn "(There was no match for #{year}, though there were matches "\
      "found for #{missed_years.join(', ')}.)"
  end
  # Part numbers are written with a dot elsewhere in this file (e.g.
  # "20223.1"); the hyphen form is still accepted as before.
  if /\d[.-]\d/.match? code
    warn "The provided document part may not exist, or the document "\
      "may no longer be published in parts."
  else
    warn "If you wanted to cite all document parts for the reference, "\
      "use \"#{code} (all parts)\".\nIf the document is not a standard, "\
      "use its document type abbreviation (TS, TR, PAS, Guide)."
  end
  nil
end
|
68
|
+
|
69
|
+
# Resolves a code (and optional year) to a fetched item.
# Returns the canned IEV record for "IEV", the matching item when one is
# found, and otherwise whatever #fetch_ref_err returns after reporting.
# `opts` is accepted but not read here.
def get1(code, year, opts)
  return iev if code.casecmp? 'IEV'

  hits = search_filter(code)
  return nil unless hits

  outcome = results_filter(hits, year)
  outcome[:ret] || fetch_ref_err(code, year, outcome[:years])
end
|
76
|
+
|
77
|
+
# Runs a search and keeps only the hits whose title begins with exactly the
# requested code (a non-space token, one whitespace character, then digits
# and dots). Hits without a title are dropped.
# A corrigendum filter used to sit here and is currently disabled:
#   corrigrx = %r{^[^\s]+\s[\d\.]+-[0-9]+/}
# @param code [String]
# @return [Array] matching hits, or an empty array
def search_filter(code)
  id_pattern = %r{^[^\s]+\s[\d\.]+}
  warn "fetching #{code}..."
  matching = search(code).select do |hit|
    hit.title && hit.title.match(id_pattern).to_s == code
  end
  matching.empty? ? [] : matching
end
|
89
|
+
|
90
|
+
# Sort through the results from Isobib, fetching them three at a time,
|
91
|
+
# and return the first result that matches the code,
|
92
|
+
# matches the year (if provided), and which has a title (amendments do not).
|
93
|
+
# Only expects the first page of results to be populated.
|
94
|
+
# Does not match corrigenda etc (e.g. ISO 3166-1:2006/Cor 1:2007)
|
95
|
+
# If no match, returns any years which caused mismatch, for error reporting
|
96
|
+
# @param result [Array] hits to fetch, processed three at a time
# @param year [String, nil] year to match; nil accepts the first fetched page
# @return [Hash] `{ ret: page }` on a match, otherwise `{ years: [...] }`
#   listing the "published" years that were seen but did not match
def results_filter(result, year)
  wanted = year && year.to_i
  skipped = []
  result.each_slice(3) do |batch|
    fetch_pages(batch, 3).each do |page|
      # Without a year the first fetched page wins.
      return { ret: page } unless year

      page.dates.each do |date|
        next unless date.type == "published"

        published = date.on.year
        return { ret: page } if wanted == published

        skipped << published
      end
    end
  end
  { years: skipped }
end
|
109
|
+
|
110
|
+
# Fetches every hit in `s` through a WorkersPool created with `n`, and
# returns the fetched values in the same order as the hits in `s`.
# Each job carries its position because results are collected by several
# workers and are sorted by that position afterwards.
def fetch_pages(s, n)
  pool = WorkersPool.new n
  pool.worker { |job| { i: job[:i], hit: job[:hit].fetch } }
  s.each_with_index do |hit, position|
    pool << { i: position, hit: hit }
  end
  pool.end
  ordered = pool.result.sort_by { |entry| entry[:i] }
  ordered.map { |entry| entry[:hit] }
end
|
117
|
+
|
118
|
+
# Builds the fixed bibliographic record used for the "IEV" code and passes
# it to Nokogiri::XML.fragment.
# The current year is read once so the <date> and <copyright> elements
# always carry the same value.
def iev
  year = Date.today.year
  Nokogiri::XML.fragment(<<~"XML")
    <bibitem type="international-standard" id="IEV">
    <title format="text/plain" language="en" script="Latn">Electropedia: The World's Online Electrotechnical Vocabulary</title>
    <source type="src">http://www.electropedia.org</source>
    <docidentifier>IEV</docidentifier>
    <date type="published"> <on>#{year}</on> </date>
    <contributor>
    <role type="publisher"/>
    <organization>
    <name>International Electrotechnical Commission</name>
    <abbreviation>IEC</abbreviation>
    <uri>www.iec.ch</uri>
    </organization>
    </contributor>
    <language>en</language> <language>fr</language>
    <script>Latn</script>
    <copyright>
    <from>#{year}</from>
    <owner>
    <organization>
    <name>International Electrotechnical Commission</name>
    <abbreviation>IEC</abbreviation>
    <uri>www.iec.ch</uri>
    </organization>
    </owner>
    </copyright>
    <relation type="updates">
    <bibitem>
    <formattedref>IEC 60050</formattedref>
    </bibitem>
    </relation>
    </bibitem>
  XML
end
|
33
153
|
end
|
34
154
|
end
|
35
155
|
end
|
data/lib/gbbib/hit.rb
CHANGED
data/lib/gbbib/hit_collection.rb
CHANGED
@@ -21,25 +21,25 @@ module Gbbib
|
|
21
21
|
@hit_pages = hit_pages
|
22
22
|
end
|
23
23
|
|
24
|
-
# @return [
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
24
|
+
# @return [Gbbib::HitCollection]
|
25
|
+
# Calls `fetch` on every hit of the collection through a WorkersPool
# created with 4, marks the collection as fetched and returns the
# collection itself.
def fetch
  pool = WorkersPool.new 4
  pool.worker { |hit| hit.fetch }
  each { |hit| pool << hit }
  pool.end
  # The returned list is not used; `result` is called so that all queued
  # work is finished before the collection is flagged as fetched.
  pool.result
  @fetched = true
  self
end
|
36
36
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
37
|
+
# String form of the collection; identical to #inspect.
def to_s
  inspect
end

# Short description: the class name, a zero-padded hexadecimal number
# derived from the object id, and the @fetched flag.
def inspect
  address = format('%#.14x', object_id << 1)
  "<#{self.class}:#{address} @fetched=#{@fetched}>"
end
|
44
44
|
end
|
45
45
|
end
|
data/lib/gbbib/scrapper.rb
CHANGED
@@ -14,18 +14,19 @@ module Gbbib
|
|
14
14
|
# @return [Hash]
|
15
15
|
# Collects everything scraped from one document page into a single hash.
# Each value comes from the matching get_* helper; the link entry wraps the
# given `src`, and language/script are fixed to 'zh' / 'Hans'.
# @param doc [Nokogiri::HTML::Document]
# @param src [String] address stored as the 'src' link
# @return [Hash]
def scrapped_data(doc, src:)
  source_link = { type: 'src', content: src }
  {
    committee: get_committee(doc),
    docid: get_docid(doc),
    titles: get_titles(doc),
    contributors: get_contributors(doc),
    type: get_type(doc),
    docstatus: get_status(doc),
    gbtype: get_gbtype(doc),
    ccs: get_ccs(doc),
    ics: get_ics(doc),
    link: [source_link],
    dates: get_dates(doc),
    language: ['zh'],
    script: ['Hans']
  }
end
|
31
32
|
# rubocop:enable Metrics/MethodLength
|
@@ -35,11 +36,17 @@ module Gbbib
|
|
35
36
|
# * :project_number [String]
|
36
37
|
# * :part_number [String]
|
37
38
|
# Splits the number found in the node selected by `xpt` into a project
# number and a part number. The part is the digits after a dot, or an empty
# string when the number has no dotted part.
# NOTE(review): text that contains no whitespace-preceded number makes the
# match nil and the hash construction raise NoMethodError -- confirm that
# callers can rely on the node always holding a number.
def get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
  label = doc.xpath(xpt).text
  pieces = label.match(/(?<=\s)(\d+)\.?((?<=\.)\d+|)/)
  { project_number: pieces[1], part_number: pieces[2] }
end
|
42
42
|
|
43
|
+
# Builds the contributor list from the leading non-space token of the text
# selected by `xpt`. That token is used as both name and abbreviation of a
# single organization with the 'publisher' role.
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>]
def get_contributors(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
  prefix = doc.xpath(xpt).text[/^[^\s]+/].to_s
  publisher = IsoBibItem::Organization.new(name: prefix, abbreviation: prefix)
  [{ entity: publisher, roles: ['publisher'] }]
end
|
48
|
+
|
49
|
+
|
43
50
|
# @param doc [Nokogiri::HTML::Document]
|
44
51
|
# @return [Array<Hash>]
|
45
52
|
# * :title_intro [String]
|
data/lib/gbbib/version.rb
CHANGED
@@ -0,0 +1,41 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Workers pool.
|
4
|
+
# Small pool of worker threads fed through a bounded queue.
#
# Usage: create the pool, register the job with #worker, push items with
# #<<, call #end once everything is queued, then read #result.
class WorkersPool
  attr_accessor :nb_hits

  # @param num_workers [Integer] requested thread count; anything below 2
  #   is raised to 2
  def initialize(num_workers = 2)
    @num_workers = [num_workers, 2].max
    # Sized from the clamped count: SizedQueue rejects a non-positive size,
    # so using the raw argument made WorkersPool.new(0) raise.
    @queue = SizedQueue.new(@num_workers * 2)
    @result = []
    @result_lock = Mutex.new
    # Empty until #worker runs, so #result is safe to call at any time.
    @threads = []
    @nb_hits = 0
  end

  # Starts the threads. Each one pops items until it sees the :END marker
  # and stores the block's return value for every item. Without a block the
  # items are consumed and nothing is stored.
  # @return [Array<Thread>]
  def worker(&block)
    @threads = Array.new(@num_workers) do
      Thread.new do
        until (item = @queue.pop) == :END
          next unless block

          value = block.call(item)
          # Several threads append to the same array.
          @result_lock.synchronize { @result << value }
        end
      end
    end
  end

  # Waits for all threads to finish, then returns the collected values in
  # completion order (not submission order).
  # @return [Array]
  def result
    @threads.each(&:join)
    @result
  end

  # Queues one item; blocks while the queue is full.
  # @return [WorkersPool] self, so pushes can be chained
  def <<(item)
    @queue << item
    self
  end

  # Queues one :END marker per thread so that every thread stops.
  def end
    @num_workers.times { @queue << :END }
  end

  # @return [Integer] number of values collected so far
  def size
    @result.size
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gbbib
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-07-
|
11
|
+
date: 2018-07-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -154,6 +154,7 @@ files:
|
|
154
154
|
- lib/gbbib/sec_scrapper.rb
|
155
155
|
- lib/gbbib/t_scrapper.rb
|
156
156
|
- lib/gbbib/version.rb
|
157
|
+
- lib/gbbib/workers_pool.rb
|
157
158
|
- lib/gbbib/yaml/prefixes.yaml
|
158
159
|
homepage: https://github.com/riboseinc/gdbib
|
159
160
|
licenses:
|
@@ -175,7 +176,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
175
176
|
version: '0'
|
176
177
|
requirements: []
|
177
178
|
rubyforge_project:
|
178
|
-
rubygems_version: 2.
|
179
|
+
rubygems_version: 2.6.12
|
179
180
|
signing_key:
|
180
181
|
specification_version: 4
|
181
182
|
summary: 'GdBib: retrieve Chinese GB Standards for bibliographic use using the BibliographicItem
|