gbbib 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/Gemfile.lock +1 -1
- data/lib/gbbib/gb_bibliography.rb +121 -1
- data/lib/gbbib/hit.rb +2 -0
- data/lib/gbbib/hit_collection.rb +19 -19
- data/lib/gbbib/scrapper.rb +21 -14
- data/lib/gbbib/version.rb +1 -1
- data/lib/gbbib/workers_pool.rb +41 -0
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: eb3b926e47fdfca5623a37e092d8072be696a8fe
|
4
|
+
data.tar.gz: 781ce62f34f01ea01ce1b1bea083ec851beb56e8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c19520eb951344e11564dfba80bc16111ef331dc5b46e8bb37a31b1d3892ad56165ed90495eb82a1158e1034f6e05800dc7d40feced12c74f2e067a50e7ce173
|
7
|
+
data.tar.gz: 4b75cfa4a98a855a83800d7c84c207997f008a5d5ab80a655fa5d2098be19ae248dd255afce983b069a7d27d997aa1ac411cd26d26f5c7c31a327f11069f5776
|
data/Gemfile.lock
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require 'gbbib/workers_pool'
|
4
|
+
|
3
5
|
# GB bib module.
|
4
6
|
module Gbbib
|
5
7
|
# GB entry point class.
|
@@ -7,7 +9,7 @@ module Gbbib
|
|
7
9
|
class << self
|
8
10
|
# rubocop:disable Metrics/MethodLength
|
9
11
|
# @param text [String] code of standard for search
|
10
|
-
# @return [Gbbib::
|
12
|
+
# @return [Gbbib::HitCollection]
|
11
13
|
def search(text)
|
12
14
|
if text.match?(/^(GB|GJ|GS)/)
|
13
15
|
# Scrape national standards.
|
@@ -30,6 +32,124 @@ module Gbbib
|
|
30
32
|
end
|
31
33
|
end
|
32
34
|
# rubocop:enable Metrics/MethodLength
|
35
|
+
|
36
|
+
# Look up a GB standard and serialise the match as Relaton XML.
#
# @param code [String] the GB standard code to look up (e.g. "GB/T 20223")
# @param year [String, nil] the year the standard was published (optional)
# @param opts [Hash] options; restricted to :all_parts if an all-parts
#   reference is required
# @return [String, nil] Relaton XML serialisation of the reference, or nil
#   when no matching standard was found
def get(code, year = nil, opts = {})
  return iev.to_xml if code.casecmp? 'IEV'

  # An all-parts request is resolved through part 1 of the standard.
  code += '.1' if opts[:all_parts]
  ret = get1(code, year, opts)
  return nil if ret.nil?

  # NOTE(review): the return values of the next two calls are discarded, so
  # they only take effect if they modify `ret` in place -- confirm.
  ret.to_most_recent_reference unless year
  ret.to_all_parts if opts[:all_parts]
  ret.to_xml
end
|
49
|
+
|
50
|
+
private
|
51
|
+
|
52
|
+
# Print diagnostics to stderr for a reference that could not be resolved.
#
# NOTE(review): the messages still talk about the ISO website and ISO
# document types; the wording looks inherited from another gem -- confirm
# whether it should be adapted for GB standards.
#
# @param code [String] the standard code that was searched for
# @param year [String, nil] the requested publication year, if any
# @param missed_years [Array] publication years of candidates that were
#   examined but did not match +year+
# @return [nil] always
def fetch_ref_err(code, year, missed_years)
  id = year ? "#{code}:#{year}" : code
  warn "WARNING: no match found on the ISO website for #{id}. "\
    "The code must be exactly like it is on the website."
  unless missed_years.empty?
    warn "(There was no match for #{year}, though there were matches "\
      "found for #{missed_years.join(', ')}.)"
  end
  # Part numbers are dot-separated in the codes this gem builds and parses
  # (e.g. "GB/T 20223.1"); the hyphenated form is still recognised as before.
  if /\d[.-]\d/.match? code
    warn "The provided document part may not exist, or the document "\
      "may no longer be published in parts."
  else
    warn "If you wanted to cite all document parts for the reference, "\
      "use \"#{code} (all parts)\".\nIf the document is not a standard, "\
      "use its document type abbreviation (TS, TR, PAS, Guide)."
  end
  nil
end
|
68
|
+
|
69
|
+
# Resolve one reference: search for the code, then pick the hit that matches
# the requested year.
#
# @param code [String] the standard code to look up
# @param year [String, nil] the requested publication year, if any
# @param opts [Hash] accepted for symmetry with the caller; not read here
# @return the matching item, or nil after diagnostics have been printed
def get1(code, year, opts)
  return iev if code.casecmp? "IEV"

  hits = search_filter(code)
  return nil unless hits

  outcome = results_filter(hits, year)
  outcome[:ret] || fetch_ref_err(code, year, outcome[:years])
end
|
76
|
+
|
77
|
+
# Run a search for +code+ and keep only the hits whose title begins with
# exactly that identifier (first word, whitespace, then digits and dots).
# Announces the lookup on stderr.
#
# @param code [String] the standard code to look up
# @return the matching hits, or an empty Array when none match
def search_filter(code)
  docidrx = %r{^[^\s]+\s[\d\.]+}
  warn "fetching #{code}..."
  matching = search(code).select do |hit|
    title = hit.title
    title && title.match(docidrx).to_s == code
  end
  matching.empty? ? [] : matching
end
|
89
|
+
|
90
|
+
# Walk through the search results, fetching them three at a time, and return
# the first fetched item that satisfies the request:
# * with no year given, the very first fetched item;
# * otherwise the first item having a "published" date in that year.
# When nothing matches, the years of the "published" dates that were examined
# are returned instead, for error reporting.
#
# @param result [Enumerable] search hits to examine
# @param year [String, nil] the requested publication year, if any
# @return [Hash] { ret: item } on success, { years: [...] } otherwise
def results_filter(result, year)
  missed_years = []
  result.each_slice(3) do |batch|
    fetch_pages(batch, 3).each do |item|
      return { ret: item } unless year

      item.dates.each do |date|
        next unless date.type == "published"
        return { ret: item } if year.to_i == date.on.year

        missed_years << date.on.year
      end
    end
  end
  { years: missed_years }
end
|
109
|
+
|
110
|
+
# Fetch the given hits concurrently through a WorkersPool and return the
# fetched values in the same order as the hits were given.
#
# @param s [Array] hits; each must respond to #fetch
# @param n [Integer] number of workers requested from the pool
# @return [Array] the values returned by each hit's #fetch, in input order
def fetch_pages(s, n)
  pool = WorkersPool.new n
  pool.worker { |job| { i: job[:i], hit: job[:hit].fetch } }
  s.each_with_index { |hit, position| pool << { i: position, hit: hit } }
  pool.end
  # Workers may finish out of order, so restore the input order by index.
  pool.result.sort_by { |entry| entry[:i] }.map { |entry| entry[:hit] }
end
|
117
|
+
|
118
|
+
# Build the fixed bibliographic entry for the IEV (Electropedia) as a parsed
# XML fragment. The publication and copyright years are the current year.
#
# @return the fragment produced by Nokogiri::XML.fragment
def iev
  xml = <<~"XML"
    <bibitem type="international-standard" id="IEV">
      <title format="text/plain" language="en" script="Latn">Electropedia: The World's Online Electrotechnical Vocabulary</title>
      <source type="src">http://www.electropedia.org</source>
      <docidentifier>IEV</docidentifier>
      <date type="published"> <on>#{Date.today.year}</on> </date>
      <contributor>
        <role type="publisher"/>
        <organization>
          <name>International Electrotechnical Commission</name>
          <abbreviation>IEC</abbreviation>
          <uri>www.iec.ch</uri>
        </organization>
      </contributor>
      <language>en</language> <language>fr</language>
      <script>Latn</script>
      <copyright>
        <from>#{Date.today.year}</from>
        <owner>
          <organization>
            <name>International Electrotechnical Commission</name>
            <abbreviation>IEC</abbreviation>
            <uri>www.iec.ch</uri>
          </organization>
        </owner>
      </copyright>
      <relation type="updates">
        <bibitem>
          <formattedref>IEC 60050</formattedref>
        </bibitem>
      </relation>
    </bibitem>
  XML
  Nokogiri::XML.fragment(xml)
end
|
33
153
|
end
|
34
154
|
end
|
35
155
|
end
|
data/lib/gbbib/hit.rb
CHANGED
data/lib/gbbib/hit_collection.rb
CHANGED
@@ -21,25 +21,25 @@ module Gbbib
|
|
21
21
|
@hit_pages = hit_pages
|
22
22
|
end
|
23
23
|
|
24
|
-
# @return [
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
24
|
+
# Fetch every hit of the collection through a pool of four workers, then
# mark the collection as fetched.
#
# @return [Gbbib::HitCollection] self
def fetch
  pool = WorkersPool.new 4
  pool.worker(&:fetch)
  each { |hit| pool << hit }
  pool.end
  # The collected values are not used; the call is made before flagging the
  # collection as fetched.
  pool.result
  @fetched = true
  self
end
|
36
36
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
37
|
+
# @return [String] same text as #inspect
def to_s
  inspect
end

# Short description: class name, an address-like hexadecimal number derived
# from the object id, and the fetched flag.
#
# @return [String]
def inspect
  address = format('%#.14x', object_id << 1)
  "<#{self.class}:#{address} @fetched=#{@fetched}>"
end
|
44
44
|
end
|
45
45
|
end
|
data/lib/gbbib/scrapper.rb
CHANGED
@@ -14,18 +14,19 @@ module Gbbib
|
|
14
14
|
# @return [Hash]
|
15
15
|
def scrapped_data(doc, src:)
  # Keys are filled in a fixed order; each value comes from the getter of the
  # same concern, except the constant link/language/script entries.
  data = {}
  data[:committee] = get_committee(doc)
  data[:docid] = get_docid(doc)
  data[:titles] = get_titles(doc)
  data[:contributors] = get_contributors(doc)
  data[:type] = get_type(doc)
  data[:docstatus] = get_status(doc)
  data[:gbtype] = get_gbtype(doc)
  data[:ccs] = get_ccs(doc)
  data[:ics] = get_ics(doc)
  data[:link] = [{ type: 'src', content: src }]
  data[:dates] = get_dates(doc)
  data[:language] = ['zh']
  data[:script] = ['Hans']
  data
end
|
31
32
|
# rubocop:enable Metrics/MethodLength
|
@@ -35,11 +36,17 @@ module Gbbib
|
|
35
36
|
# * :project_number [String]
|
36
37
|
# * :part_number [String]
|
37
38
|
def get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
  reference = doc.xpath(xpt).text
  # First capture: digits after the whitespace; second: digits after a dot,
  # or an empty string when there is no dotted part.
  project, part = reference.match(/(?<=\s)(\d+)\.?((?<=\.)\d+|)/).captures
  { project_number: project, part_number: part }
end
|
42
42
|
|
43
|
+
# Build the publisher entry from the leading non-whitespace run of the text
# found at +xpt+; that run is used as both name and abbreviation.
#
# @param doc document responding to #xpath
# @param xpt [String] XPath of the element holding the text
# @return [Array<Hash>] one entry with :entity and :roles
def get_contributors(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
  prefix = doc.xpath(xpt).text[/^[^\s]+/].to_s
  publisher = IsoBibItem::Organization.new name: prefix, abbreviation: prefix
  [{ entity: publisher, roles: ['publisher'] }]
end
|
48
|
+
|
49
|
+
|
43
50
|
# @param doc [Nokogiri::HTML::Document]
|
44
51
|
# @return [Array<Hash>]
|
45
52
|
# * :title_intro [String]
|
data/lib/gbbib/version.rb
CHANGED
@@ -0,0 +1,41 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Workers pool: a fixed set of threads that apply one block to queued items.
#
# Usage: create the pool, register the block with #worker, push items with
# #<<, call #end once all items are pushed, then read #result.
class WorkersPool
  attr_accessor :nb_hits

  # @param num_workers [Integer] requested number of threads; values below 2
  #   are raised to 2
  def initialize(num_workers = 2)
    @num_workers = num_workers < 2 ? 2 : num_workers
    # Size the queue from the clamped count: SizedQueue rejects a
    # non-positive size, so the raw argument must not be used here.
    @queue = SizedQueue.new(@num_workers * 2)
    @result = []
    @result_lock = Mutex.new
    @threads = []
    @nb_hits = 0
  end

  # Start the worker threads. Each thread pops items until it receives the
  # :END marker and stores the block's value for every item. Without a block
  # the items are consumed and nothing is stored.
  #
  # @yieldparam item the queued item
  # @return [Array<Thread>] the started threads
  def worker(&block)
    @threads = Array.new @num_workers do
      Thread.new do
        until (item = @queue.pop) == :END
          next unless block

          value = yield(item)
          # Several threads append to the same array.
          @result_lock.synchronize { @result << value }
        end
      end
    end
  end

  # Wait for all worker threads to finish and return the collected values
  # (in completion order). Returns an empty array if no worker was started.
  #
  # @return [Array]
  def result
    @threads.each(&:join)
    @result
  end

  # Queue an item for processing; blocks while the queue is full.
  #
  # @return [WorkersPool] self, so pushes can be chained
  def <<(item)
    @queue << item
    self
  end

  # Signal that no more items will be pushed: one :END marker per worker.
  def end
    @num_workers.times { @queue << :END }
  end

  # @return [Integer] number of values collected so far
  def size
    @result.size
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gbbib
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-07-
|
11
|
+
date: 2018-07-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -154,6 +154,7 @@ files:
|
|
154
154
|
- lib/gbbib/sec_scrapper.rb
|
155
155
|
- lib/gbbib/t_scrapper.rb
|
156
156
|
- lib/gbbib/version.rb
|
157
|
+
- lib/gbbib/workers_pool.rb
|
157
158
|
- lib/gbbib/yaml/prefixes.yaml
|
158
159
|
homepage: https://github.com/riboseinc/gdbib
|
159
160
|
licenses:
|
@@ -175,7 +176,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
175
176
|
version: '0'
|
176
177
|
requirements: []
|
177
178
|
rubyforge_project:
|
178
|
-
rubygems_version: 2.
|
179
|
+
rubygems_version: 2.6.12
|
179
180
|
signing_key:
|
180
181
|
specification_version: 4
|
181
182
|
summary: 'GdBib: retrieve Chinese GB Standards for bibliographic use using the BibliographicItem
|