relaton-jis 1.19.1 → 1.19.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.adoc +18 -0
- data/lib/relaton_jis/bibliography.rb +5 -9
- data/lib/relaton_jis/data_fetcher.rb +156 -0
- data/lib/relaton_jis/hash_converter.rb +1 -1
- data/lib/relaton_jis/hit.rb +11 -4
- data/lib/relaton_jis/processor.rb +20 -0
- data/lib/relaton_jis/scraper.rb +6 -4
- data/lib/relaton_jis/version.rb +1 -1
- data/lib/relaton_jis.rb +2 -0
- data/relaton_jis.gemspec +1 -0
- metadata +17 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8b816ec16d7873fe9c8379b12dbe5f48f2874a50a5e113fdb439abaa6eb0a081
|
4
|
+
data.tar.gz: 94d2b6a1be560ec226f95da9eb6c8973af93fa983eb86ceb5cbaba8b72c3fbc3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f255822a0bc60c34d5b6d1074dd4b280b7626d64084710fe78a19495b8d1f6a9fe55ba2afa396ad7227024d6207c4de63ee904b8e395276e3eadbd626d9e1e93
|
7
|
+
data.tar.gz: f8b91c43a6f9d2a8a84da5f4812a135c841e0aa909938c30a6c6d1e885b338dfb83e95303d42fbd189918606d93154393f2d521ed5eea7d1cbb2adaf79f7b395
|
data/README.adoc
CHANGED
@@ -149,6 +149,24 @@ item.link
|
|
149
149
|
@type="pdf">]
|
150
150
|
----
|
151
151
|
|
152
|
+
=== Fetch data
|
153
|
+
|
154
|
+
This gem scrapes the https://webdesk.jsa.or.jp/books/W11M0270 pages to fetch the JIS Standards metadata. By default the data is saved in the `./data` folder in YAML format.
|
155
|
+
|
156
|
+
The method `RelatonJis::DataFetcher.fetch(output: "data", format: "yaml")` fetches all the documents from the dataset and saves them to the `./data` folder in YAML format.
|
157
|
+
Arguments:
|
158
|
+
|
159
|
+
- `output` - folder to save documents (default `./data`).
|
160
|
+
- `format` - the format in which the documents are saved. Possible formats are: `yaml`, `xml`, `bibxml` (default `yaml`).
|
161
|
+
|
162
|
+
[source,ruby]
|
163
|
+
----
|
164
|
+
RelatonJis::DataFetcher.fetch
|
165
|
+
Start fetching JIS data at 2024-09-27 17:49:40 -0400
|
166
|
+
Fetching JIS data finished at 2024-09-27 18:40:11 -0400. It took 3031.0 seconds.
|
167
|
+
=> nil
|
168
|
+
----
|
169
|
+
|
152
170
|
=== Logging
|
153
171
|
|
154
172
|
RelatonJis uses the relaton-logger gem for logging. By default, it logs to STDOUT. To change the log levels and add other loggers, read the https://github.com/relaton/relaton-logger#usage[relaton-logger] documentation.
|
@@ -2,7 +2,8 @@ module RelatonJis
|
|
2
2
|
module Bibliography
|
3
3
|
extend self
|
4
4
|
|
5
|
-
SOURCE = "https://webdesk.jsa.or.jp/books/W11M".freeze
|
5
|
+
# SOURCE = "https://webdesk.jsa.or.jp/books/W11M".freeze
|
6
|
+
GH_URL = "https://raw.githubusercontent.com/relaton/relaton-data-jis/refs/heads/main/".freeze
|
6
7
|
|
7
8
|
#
|
8
9
|
# Search JIS by keyword
|
@@ -13,14 +14,9 @@ module RelatonJis
|
|
13
14
|
# @return [RelatonJis::HitCollection] search result
|
14
15
|
#
|
15
16
|
def search(code, year = nil)
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
# raise RelatonBib::RequestError, "No results found for #{code}" if disp["disp_screen"].nil?
|
20
|
-
return unless disp["status"]
|
21
|
-
|
22
|
-
result = agent.get "#{SOURCE}0070/index"
|
23
|
-
HitCollection.new code, year, result: result.xpath("//div[@class='blockGenaral']")
|
17
|
+
index = Relaton::Index.find_or_create(:jis, url: "#{GH_URL}index-v1.zip", file: DataFetcher::INDEX_FILE)
|
18
|
+
result = index.search(code).sort_by { |h| h[:id] }
|
19
|
+
HitCollection.new code, year, result: result # .xpath("//div[@class='blockGenaral']")
|
24
20
|
end
|
25
21
|
|
26
22
|
#
|
@@ -0,0 +1,156 @@
|
|
1
|
+
module RelatonJis
|
2
|
+
class DataFetcher
|
3
|
+
URL = "https://webdesk.jsa.or.jp/books/".freeze
|
4
|
+
INDEX_FILE = "index-v1.yaml".freeze
|
5
|
+
|
6
|
+
def initialize(output, format)
|
7
|
+
@output = output
|
8
|
+
@format = format
|
9
|
+
@ext = format.sub("bibxml", "xml")
|
10
|
+
@files = Set.new
|
11
|
+
@queue = SizedQueue.new 10
|
12
|
+
@threads = create_thread_pool 5
|
13
|
+
@mutex = Mutex.new
|
14
|
+
end
|
15
|
+
|
16
|
+
def self.fetch(output: "data", format: "yaml")
|
17
|
+
start_time = Time.now
|
18
|
+
puts "Start fetching JIS data at #{start_time}"
|
19
|
+
FileUtils.mkdir_p output
|
20
|
+
new(output, format).fetch
|
21
|
+
stop_time = Time.now
|
22
|
+
puts "Fetching JIS data finished at #{stop_time}. It took #{stop_time - start_time} seconds."
|
23
|
+
end
|
24
|
+
|
25
|
+
def create_thread_pool(size)
|
26
|
+
Array.new(size) do
|
27
|
+
Thread.new do
|
28
|
+
until (url = @queue.shift) == :END
|
29
|
+
fetch_doc url
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
def fetch_doc(url) # rubocop:disable Metrics/MethodLength
|
36
|
+
attempts = 0
|
37
|
+
begin
|
38
|
+
bib = Scraper.new(url).fetch
|
39
|
+
rescue StandardError => e
|
40
|
+
attempts += 1
|
41
|
+
if attempts < 5
|
42
|
+
sleep 2
|
43
|
+
retry
|
44
|
+
else
|
45
|
+
Util.warn "URL: #{url}"
|
46
|
+
Util.warn "#{e.message}\n#{e.backtrace[0..6].join("\n")}"
|
47
|
+
end
|
48
|
+
else
|
49
|
+
save_doc bib, url
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
def fetch
|
54
|
+
return unless initial_post
|
55
|
+
|
56
|
+
resp = agent.get "#{URL}W11M0070/index"
|
57
|
+
parse_page resp
|
58
|
+
index.save
|
59
|
+
end
|
60
|
+
|
61
|
+
def initial_post
|
62
|
+
return true if @initial_time && Time.now - @initial_time < 600
|
63
|
+
|
64
|
+
body = { record: 0, dantai: "JIS", searchtype2: 1, status_1: 1, status_2: 2 }
|
65
|
+
resp = agent.post "#{URL}W11M0270/index", body
|
66
|
+
disp = JSON.parse resp.body
|
67
|
+
@initial_time = Time.now
|
68
|
+
disp["status"] || Util.warn("No results found for JIS")
|
69
|
+
end
|
70
|
+
|
71
|
+
def agent
|
72
|
+
@agent ||= Mechanize.new
|
73
|
+
end
|
74
|
+
|
75
|
+
def parse_page(resp)
|
76
|
+
while resp
|
77
|
+
resp.xpath('//div[@class="blockGenaral"]/a').each { |a| @queue << a[:href] }
|
78
|
+
offset = parse_offset resp
|
79
|
+
break if offset >= count # no more pages
|
80
|
+
|
81
|
+
resp = get_next_page(offset)
|
82
|
+
end
|
83
|
+
end_threads_and_wait
|
84
|
+
end
|
85
|
+
|
86
|
+
def parse_offset(resp) # rubocop:disable Metrics/AbcSize
|
87
|
+
if resp.at('//*[@id="btnPaging"]') # first page
|
88
|
+
@count = resp.at('//script[contains(.,"var count =")]').text.match(/var count = (\d+);/)[1]
|
89
|
+
resp.at("//*[@id='offset']")[:value].to_i
|
90
|
+
else
|
91
|
+
script = resp.at("//script").text
|
92
|
+
script.match(/\("offset"\)\.value = '(\d+)'/)[1].to_i
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
def end_threads_and_wait
|
97
|
+
@threads.size.times { @queue << :END }
|
98
|
+
@queue.close
|
99
|
+
@threads.each(&:join)
|
100
|
+
end
|
101
|
+
|
102
|
+
def count
|
103
|
+
@count.to_i
|
104
|
+
end
|
105
|
+
|
106
|
+
def get_next_page(offset) # rubocop:disable Metrics/MethodLength
|
107
|
+
attempts = 0
|
108
|
+
begin
|
109
|
+
if initial_post
|
110
|
+
agent.post "#{URL}W11M0070/getAddList", search_type: "JIS", offset: offset
|
111
|
+
end
|
112
|
+
rescue StandardError => e
|
113
|
+
attempts += 1
|
114
|
+
if attempts < 5
|
115
|
+
sleep 2
|
116
|
+
retry
|
117
|
+
else
|
118
|
+
Util.warn "#{e.message}\n#{e.backtrace[0..6].join("\n")}"
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
def save_doc(bib, url) # rubocop:disable Metrics/MethodLength
|
124
|
+
return unless bib
|
125
|
+
|
126
|
+
id = bib.docidentifier.find(&:primary).id
|
127
|
+
file = file id
|
128
|
+
@mutex.synchronize do
|
129
|
+
if @files.include?(file)
|
130
|
+
Util.warn "File #{file} already exists. Duplication URL: #{url}"
|
131
|
+
else
|
132
|
+
@files << file
|
133
|
+
File.write file, serialize(bib), encoding: "UTF-8"
|
134
|
+
index.add_or_update id, file
|
135
|
+
end
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
def index
|
140
|
+
@index ||= Relaton::Index.find_or_create :jis, file: INDEX_FILE
|
141
|
+
end
|
142
|
+
|
143
|
+
def file(id)
|
144
|
+
name = id.gsub(/[:\/\s]/, "_")
|
145
|
+
File.join @output, "#{name}.#{@ext}"
|
146
|
+
end
|
147
|
+
|
148
|
+
def serialize(bib)
|
149
|
+
case @format
|
150
|
+
when "yaml" then bib.to_hash.to_yaml
|
151
|
+
when "xml" then bib.to_xml bibdata: true
|
152
|
+
else bib.send "to_#{@format}"
|
153
|
+
end
|
154
|
+
end
|
155
|
+
end
|
156
|
+
end
|
data/lib/relaton_jis/hit.rb
CHANGED
@@ -8,9 +8,9 @@ module RelatonJis
|
|
8
8
|
#
|
9
9
|
# @return [RelatonJis::Hit] new hit
|
10
10
|
#
|
11
|
-
def self.create(
|
12
|
-
a = node.at("./a")
|
13
|
-
hit = { id: a.at("./text()").text.strip, url: a["href"] }
|
11
|
+
def self.create(hit, collection)
|
12
|
+
# a = node.at("./a")
|
13
|
+
# hit = { id: a.at("./text()").text.strip, url: a["href"] }
|
14
14
|
new hit, collection
|
15
15
|
end
|
16
16
|
|
@@ -44,7 +44,14 @@ module RelatonJis
|
|
44
44
|
end
|
45
45
|
|
46
46
|
def fetch
|
47
|
-
@fetch
|
47
|
+
return @fetch if defined? @fetch
|
48
|
+
|
49
|
+
# @fetch = Scraper.new(hit[:url]).fetch
|
50
|
+
yaml = Mechanize.new.get("#{Bibliography::GH_URL}#{hit[:file]}").body
|
51
|
+
hash = YAML.safe_load yaml
|
52
|
+
@fetch = RelatonJis::BibliographicItem.from_hash hash
|
53
|
+
@fetch.fetched = Date.today.to_s
|
54
|
+
@fetch
|
48
55
|
end
|
49
56
|
end
|
50
57
|
end
|
@@ -9,6 +9,7 @@ module RelatonJis
|
|
9
9
|
@prefix = "JIS"
|
10
10
|
@defaultprefix = %r{^(JIS|TR)\s}
|
11
11
|
@idtype = "JIS"
|
12
|
+
@datasets = %w[jis-webdesk]
|
12
13
|
end
|
13
14
|
|
14
15
|
# @param code [String]
|
@@ -19,6 +20,18 @@ module RelatonJis
|
|
19
20
|
::RelatonJis::Bibliography.get(code, date, opts)
|
20
21
|
end
|
21
22
|
|
23
|
+
#
|
24
|
+
# Fetch all the documents from a source
|
25
|
+
#
|
26
|
+
# @param [String] _source source name
|
27
|
+
# @param [Hash] opts
|
28
|
+
# @option opts [String] :output directory to output documents
|
29
|
+
# @option opts [String] :format
|
30
|
+
#
|
31
|
+
def fetch_data(_source, opts)
|
32
|
+
DataFetcher.fetch(**opts)
|
33
|
+
end
|
34
|
+
|
22
35
|
# @param xml [String]
|
23
36
|
# @return [RelatonJis::BibliographicItem]
|
24
37
|
def from_xml(xml)
|
@@ -37,5 +50,12 @@ module RelatonJis
|
|
37
50
|
def grammar_hash
|
38
51
|
@grammar_hash ||= ::RelatonJis.grammar_hash
|
39
52
|
end
|
53
|
+
|
54
|
+
#
|
55
|
+
# Remove index file
|
56
|
+
#
|
57
|
+
def remove_index_file
|
58
|
+
Relaton::Index.find_or_create(:jis, url: true, file: DataFetcher::INDEX_FILE).remove_file
|
59
|
+
end
|
40
60
|
end
|
41
61
|
end
|
data/lib/relaton_jis/scraper.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
1
3
|
module RelatonJis
|
2
4
|
class Scraper
|
3
5
|
ATTRS = %i[
|
4
|
-
|
6
|
+
title link abstract docid docnumber date type language script
|
5
7
|
docstatus doctype ics contributor editorialgroup structuredidentifier
|
6
8
|
].freeze
|
7
9
|
|
@@ -24,9 +26,9 @@ module RelatonJis
|
|
24
26
|
BibliographicItem.new(**attrs)
|
25
27
|
end
|
26
28
|
|
27
|
-
def fetch_fetched
|
28
|
-
|
29
|
-
end
|
29
|
+
# def fetch_fetched
|
30
|
+
# Date.today.to_s
|
31
|
+
# end
|
30
32
|
|
31
33
|
def fetch_title
|
32
34
|
{ "ja" => "Jpan", "en" => "Lant" }.map.with_index do |(lang, script), i|
|
data/lib/relaton_jis/version.rb
CHANGED
data/lib/relaton_jis.rb
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
require "mechanize"
|
4
4
|
require "relaton_iso_bib"
|
5
|
+
require "relaton/index"
|
5
6
|
require_relative "relaton_jis/version"
|
6
7
|
require_relative "relaton_jis/util"
|
7
8
|
require_relative "relaton_jis/document_type"
|
@@ -12,6 +13,7 @@ require_relative "relaton_jis/scraper"
|
|
12
13
|
require_relative "relaton_jis/bibliography"
|
13
14
|
require_relative "relaton_jis/hit_collection"
|
14
15
|
require_relative "relaton_jis/hit"
|
16
|
+
require_relative "relaton_jis/data_fetcher"
|
15
17
|
|
16
18
|
module RelatonJis
|
17
19
|
class Error < StandardError; end
|
data/relaton_jis.gemspec
CHANGED
@@ -35,6 +35,7 @@ Gem::Specification.new do |spec|
|
|
35
35
|
|
36
36
|
# Uncomment to register a new dependency of your gem
|
37
37
|
spec.add_dependency "mechanize", "~> 2.10"
|
38
|
+
spec.add_dependency "relaton-index", "~> 0.2.0"
|
38
39
|
spec.add_dependency "relaton-iso-bib", "~> 1.19.0"
|
39
40
|
|
40
41
|
# For more information and examples about making a new gem, check out our
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-jis
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.19.
|
4
|
+
version: 1.19.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-09-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '2.10'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: relaton-index
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.2.0
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 0.2.0
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: relaton-iso-bib
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -60,6 +74,7 @@ files:
|
|
60
74
|
- lib/relaton_jis.rb
|
61
75
|
- lib/relaton_jis/bibliographic_item.rb
|
62
76
|
- lib/relaton_jis/bibliography.rb
|
77
|
+
- lib/relaton_jis/data_fetcher.rb
|
63
78
|
- lib/relaton_jis/document_type.rb
|
64
79
|
- lib/relaton_jis/hash_converter.rb
|
65
80
|
- lib/relaton_jis/hit.rb
|