relaton-jis 1.19.1 → 1.19.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +18 -0
- data/lib/relaton_jis/bibliography.rb +5 -9
- data/lib/relaton_jis/data_fetcher.rb +156 -0
- data/lib/relaton_jis/hash_converter.rb +1 -1
- data/lib/relaton_jis/hit.rb +11 -4
- data/lib/relaton_jis/processor.rb +20 -0
- data/lib/relaton_jis/scraper.rb +6 -4
- data/lib/relaton_jis/version.rb +1 -1
- data/lib/relaton_jis.rb +2 -0
- data/relaton_jis.gemspec +1 -0
- metadata +17 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 8b816ec16d7873fe9c8379b12dbe5f48f2874a50a5e113fdb439abaa6eb0a081
|
|
4
|
+
data.tar.gz: 94d2b6a1be560ec226f95da9eb6c8973af93fa983eb86ceb5cbaba8b72c3fbc3
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: f255822a0bc60c34d5b6d1074dd4b280b7626d64084710fe78a19495b8d1f6a9fe55ba2afa396ad7227024d6207c4de63ee904b8e395276e3eadbd626d9e1e93
|
|
7
|
+
data.tar.gz: f8b91c43a6f9d2a8a84da5f4812a135c841e0aa909938c30a6c6d1e885b338dfb83e95303d42fbd189918606d93154393f2d521ed5eea7d1cbb2adaf79f7b395
|
data/README.adoc
CHANGED
|
@@ -149,6 +149,24 @@ item.link
|
|
|
149
149
|
@type="pdf">]
|
|
150
150
|
----
|
|
151
151
|
|
|
152
|
+
=== Fetch data
|
|
153
|
+
|
|
154
|
+
This gem scrapes the https://webdesk.jsa.or.jp/books/W11M0270 pages to fetch the JIS Standards metadata. By default the data is saved in the `./data` folder in YAML format.
|
|
155
|
+
|
|
156
|
+
The method `RelatonJis::DataFetcher.fetch(output: "data", format: "yaml")` fetches all the documents from the dataset and saves them to the `./data` folder in YAML format.
|
|
157
|
+
Arguments:
|
|
158
|
+
|
|
159
|
+
- `output` - folder to save documents (default './data').
|
|
160
|
+
- `format` - the format in which the documents are saved. Possible formats are: `yaml`, `xml`, `bibxml` (default `yaml`).
|
|
161
|
+
|
|
162
|
+
[source,ruby]
|
|
163
|
+
----
|
|
164
|
+
RelatonJis::DataFetcher.fetch
|
|
165
|
+
Start fetching JIS data at 2024-09-27 17:49:40 -0400
|
|
166
|
+
Fetching JIS data finished at 2024-09-27 18:40:11 -0400. It took 3031.0 seconds.
|
|
167
|
+
=> nil
|
|
168
|
+
----
|
|
169
|
+
|
|
152
170
|
=== Logging
|
|
153
171
|
|
|
154
172
|
RelatonJis uses the relaton-logger gem for logging. By default, it logs to STDOUT. To change the log levels and add other loggers, read the https://github.com/relaton/relaton-logger#usage[relaton-logger] documentation.
|
|
@@ -2,7 +2,8 @@ module RelatonJis
|
|
|
2
2
|
module Bibliography
|
|
3
3
|
extend self
|
|
4
4
|
|
|
5
|
-
SOURCE = "https://webdesk.jsa.or.jp/books/W11M".freeze
|
|
5
|
+
# SOURCE = "https://webdesk.jsa.or.jp/books/W11M".freeze
|
|
6
|
+
GH_URL = "https://raw.githubusercontent.com/relaton/relaton-data-jis/refs/heads/main/".freeze
|
|
6
7
|
|
|
7
8
|
#
|
|
8
9
|
# Search JIS by keyword
|
|
@@ -13,14 +14,9 @@ module RelatonJis
|
|
|
13
14
|
# @return [RelatonJis::HitCollection] search result
|
|
14
15
|
#
|
|
15
16
|
def search(code, year = nil)
  # Look the reference up in the :jis index; the index is located through
  # GH_URL ("index-v1.zip") and DataFetcher::INDEX_FILE.
  jis_index = Relaton::Index.find_or_create(
    :jis, url: "#{GH_URL}index-v1.zip", file: DataFetcher::INDEX_FILE
  )
  rows = jis_index.search(code)
  # Hits are handed over ordered by their :id value.
  HitCollection.new(code, year, result: rows.sort_by { |row| row[:id] })
end
|
|
25
21
|
|
|
26
22
|
#
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
module RelatonJis
  #
  # Scrapes the JSA Webdesk result list and stores every document found there
  # as a file in the output folder, registering each file in the :jis index.
  #
  # The calling thread walks the paginated result list and pushes document
  # URLs onto a bounded queue; a pool of worker threads pops the URLs, scrapes
  # the documents with Scraper and writes them out.
  #
  class DataFetcher
    URL = "https://webdesk.jsa.or.jp/books/".freeze
    INDEX_FILE = "index-v1.yaml".freeze

    # Number of tries before a failing request is given up on.
    MAX_ATTEMPTS = 5
    # Pause between two tries, in seconds.
    RETRY_DELAY = 2
    # Age in seconds after which #initial_post repeats the search POST.
    # NOTE(review): presumably the server-side search session expires - confirm.
    SESSION_TTL = 600

    #
    # @param output [String] folder the documents are written to
    # @param format [String] "yaml", "xml" or "bibxml"
    #
    def initialize(output, format)
      @output = output
      @format = format
      @ext = format.sub("bibxml", "xml") # bibxml documents get the .xml extension
      @files = Set.new
      @queue = SizedQueue.new 10
      # The mutex is created before the workers start because they use it in #save_doc.
      @mutex = Mutex.new
      @threads = create_thread_pool 5
    end

    #
    # Fetch all the documents and report start time, stop time and duration on STDOUT.
    #
    # @param output [String] folder to save the documents to (created if missing)
    # @param format [String] "yaml", "xml" or "bibxml"
    #
    # @return [nil]
    #
    def self.fetch(output: "data", format: "yaml")
      start_time = Time.now
      puts "Start fetching JIS data at #{start_time}"
      FileUtils.mkdir_p output
      new(output, format).fetch
      stop_time = Time.now
      puts "Fetching JIS data finished at #{stop_time}. It took #{stop_time - start_time} seconds."
    end

    #
    # Start the worker threads. Each worker pops URLs from the queue and
    # processes them with #fetch_doc until it pops the :END marker.
    #
    # @param size [Integer] number of workers
    #
    # @return [Array<Thread>]
    #
    def create_thread_pool(size)
      Array.new(size) do
        Thread.new do
          loop do
            url = @queue.shift
            # A closed, drained queue returns nil from #shift; stop then as
            # well instead of spinning on nil forever.
            break if url == :END || (url.nil? && @queue.closed?)

            fetch_doc url
          end
        end
      end
    end

    #
    # Scrape one document and save it. Scraping is retried up to MAX_ATTEMPTS
    # times. Any StandardError, including one raised while saving, is logged
    # as a warning instead of being raised, so a single bad document cannot
    # terminate the worker thread that runs this method.
    #
    # @param url [String] document page URL
    #
    def fetch_doc(url)
      bib = with_retries(url) { Scraper.new(url).fetch }
      save_doc bib, url
    rescue StandardError => e
      warn_error e, url
    end

    #
    # Run the whole fetch: open the search, walk the result pages and save
    # the index. The worker threads are always shut down, also when the
    # search fails or a request raises.
    #
    # @return [Object, nil] result of saving the index, nil if the search failed
    #
    def fetch
      return unless initial_post

      resp = agent.get "#{URL}W11M0070/index"
      parse_page resp
      index.save
    ensure
      end_threads_and_wait
    end

    #
    # POST the JIS search unless a successful one was made less than
    # SESSION_TTL seconds ago.
    #
    # @return [Object] truthy when the search is usable, falsy otherwise
    #
    def initial_post
      return true if @initial_time && Time.now - @initial_time < SESSION_TTL

      body = { record: 0, dantai: "JIS", searchtype2: 1, status_1: 1, status_2: 2 }
      resp = agent.post "#{URL}W11M0270/index", body
      status = JSON.parse(resp.body)["status"]
      if status
        # Only a successful search is remembered; a failed one is retried on the next call.
        @initial_time = Time.now
      else
        Util.warn("No results found for JIS")
      end
      status
    end

    # @return [Mechanize] memoized HTTP agent used for the search and paging requests
    def agent
      @agent ||= Mechanize.new
    end

    #
    # Queue the document links of the given page and of all following pages,
    # then shut the workers down and wait for them.
    #
    # @param resp [Mechanize::Page, nil] first result page
    #
    def parse_page(resp)
      while resp
        resp.xpath('//div[@class="blockGenaral"]/a').each { |a| @queue << a[:href] }
        offset = parse_offset resp
        break if offset >= count # no more pages

        resp = get_next_page(offset)
      end
      end_threads_and_wait
    end

    #
    # Read the offset of the next page from a result page. On the first page
    # (the one with the "btnPaging" element) the total number of results is
    # remembered in @count as well.
    #
    # @param resp [Mechanize::Page]
    #
    # @return [Integer]
    #
    def parse_offset(resp) # rubocop:disable Metrics/AbcSize
      if resp.at('//*[@id="btnPaging"]') # first page
        @count = resp.at('//script[contains(.,"var count =")]').text.match(/var count = (\d+);/)[1]
        resp.at("//*[@id='offset']")[:value].to_i
      else
        script = resp.at("//script").text
        script.match(/\("offset"\)\.value = '(\d+)'/)[1].to_i
      end
    end

    #
    # Send one :END marker per worker, close the queue and join the workers.
    # Calling it again after the queue is closed does nothing.
    #
    def end_threads_and_wait
      return if @queue.closed?

      @threads.size.times { @queue << :END }
      @queue.close
      @threads.each(&:join)
    end

    # @return [Integer] total number of results (0 until the first page is parsed)
    def count
      @count.to_i
    end

    #
    # Request the result page starting at the given offset, retrying up to
    # MAX_ATTEMPTS times.
    #
    # @param offset [Integer]
    #
    # @return [Mechanize::Page, nil] nil when the search could not be
    #   (re)opened or all attempts failed
    #
    def get_next_page(offset)
      with_retries do
        if initial_post
          agent.post "#{URL}W11M0070/getAddList", search_type: "JIS", offset: offset
        end
      end
    end

    #
    # Write the document to its file and register it in the index. A document
    # whose file was already written during this run is skipped with a warning.
    #
    # @param bib [RelatonJis::BibliographicItem, nil] nothing is done for nil
    # @param url [String] page the document was scraped from (used in the warning)
    #
    def save_doc(bib, url) # rubocop:disable Metrics/MethodLength
      return unless bib

      id = bib.docidentifier.find(&:primary).id
      file = file id
      # @files and the index are shared between the workers.
      @mutex.synchronize do
        if @files.include?(file)
          Util.warn "File #{file} already exists. Duplication URL: #{url}"
        else
          @files << file
          File.write file, serialize(bib), encoding: "UTF-8"
          index.add_or_update id, file
        end
      end
    end

    # @return [Relaton::Index::Type] memoized :jis index backed by INDEX_FILE
    def index
      @index ||= Relaton::Index.find_or_create :jis, file: INDEX_FILE
    end

    #
    # Build the file path for a document ID: colons, slashes and whitespace
    # are replaced with underscores.
    #
    # @param id [String] document ID
    #
    # @return [String] path inside the output folder
    #
    def file(id)
      name = id.gsub(/[:\/\s]/, "_")
      File.join @output, "#{name}.#{@ext}"
    end

    #
    # Serialize the document in the format given to the constructor.
    #
    # @param bib [RelatonJis::BibliographicItem]
    #
    # @return [String]
    #
    def serialize(bib)
      case @format
      when "yaml" then bib.to_hash.to_yaml
      when "xml" then bib.to_xml bibdata: true
      else bib.send "to_#{@format}"
      end
    end

    private

    #
    # Run the block, retrying after RETRY_DELAY seconds on any StandardError,
    # at most MAX_ATTEMPTS times in total. The last error is logged.
    #
    # @param url [String, nil] URL added to the warning when given
    #
    # @return [Object, nil] block result, or nil when every attempt failed
    #
    def with_retries(url = nil)
      attempts = 0
      begin
        yield
      rescue StandardError => e
        attempts += 1
        if attempts < MAX_ATTEMPTS
          sleep RETRY_DELAY
          retry
        end
        warn_error e, url
        nil # explicit, so callers do not depend on what Util.warn returns
      end
    end

    # Log the error message with the first lines of its backtrace.
    def warn_error(error, url = nil)
      Util.warn "URL: #{url}" if url
      Util.warn "#{error.message}\n#{Array(error.backtrace).first(7).join("\n")}"
    end
  end
end
|
data/lib/relaton_jis/hit.rb
CHANGED
|
@@ -8,9 +8,9 @@ module RelatonJis
|
|
|
8
8
|
#
|
|
9
9
|
# @return [RelatonJis::Hit] new hit
|
|
10
10
|
#
|
|
11
|
-
def self.create(
|
|
12
|
-
a = node.at("./a")
|
|
13
|
-
hit = { id: a.at("./text()").text.strip, url: a["href"] }
|
|
11
|
+
def self.create(hit, collection)
|
|
12
|
+
# a = node.at("./a")
|
|
13
|
+
# hit = { id: a.at("./text()").text.strip, url: a["href"] }
|
|
14
14
|
# The hit is passed to the constructor as received; the commented-out
# lines above built it from an HTML node instead.
new hit, collection
|
|
15
15
|
end
|
|
16
16
|
|
|
@@ -44,7 +44,14 @@ module RelatonJis
|
|
|
44
44
|
end
|
|
45
45
|
|
|
46
46
|
def fetch
  # The document is downloaded and parsed once; later calls return the
  # stored value whenever @fetch has been assigned.
  unless defined?(@fetch)
    # @fetch = Scraper.new(hit[:url]).fetch
    doc_url = "#{Bibliography::GH_URL}#{hit[:file]}"
    attrs = YAML.safe_load(Mechanize.new.get(doc_url).body)
    @fetch = RelatonJis::BibliographicItem.from_hash(attrs)
    # The fetch date is set to today's date.
    @fetch.fetched = Date.today.to_s
  end
  @fetch
end
|
|
49
56
|
end
|
|
50
57
|
end
|
|
@@ -9,6 +9,7 @@ module RelatonJis
|
|
|
9
9
|
@prefix = "JIS"
|
|
10
10
|
@defaultprefix = %r{^(JIS|TR)\s}
|
|
11
11
|
@idtype = "JIS"
|
|
12
|
+
@datasets = %w[jis-webdesk]
|
|
12
13
|
end
|
|
13
14
|
|
|
14
15
|
# @param code [String]
|
|
@@ -19,6 +20,18 @@ module RelatonJis
|
|
|
19
20
|
::RelatonJis::Bibliography.get(code, date, opts)
|
|
20
21
|
end
|
|
21
22
|
|
|
23
|
+
#
|
|
24
|
+
# Fetch all the documents from a source
|
|
25
|
+
#
|
|
26
|
+
# @param [String] _source source name
|
|
27
|
+
# @param [Hash] opts
|
|
28
|
+
# @option opts [String] :output directory to output documents
|
|
29
|
+
# @option opts [String] :format
|
|
30
|
+
#
|
|
31
|
+
def fetch_data(_source, opts)
|
|
32
|
+
# _source is not used; opts is forwarded as keyword arguments.
DataFetcher.fetch(**opts)
|
|
33
|
+
end
|
|
34
|
+
|
|
22
35
|
# @param xml [String]
|
|
23
36
|
# @return [RelatonJis::BibliographicItem]
|
|
24
37
|
def from_xml(xml)
|
|
@@ -37,5 +50,12 @@ module RelatonJis
|
|
|
37
50
|
def grammar_hash
|
|
38
51
|
# Memoized: ::RelatonJis.grammar_hash is called again only while the
# stored value is nil or false.
@grammar_hash ||= ::RelatonJis.grammar_hash
|
|
39
52
|
end
|
|
53
|
+
|
|
54
|
+
#
|
|
55
|
+
# Remove index file
|
|
56
|
+
#
|
|
57
|
+
def remove_index_file
|
|
58
|
+
# Looks up the :jis index for DataFetcher::INDEX_FILE and calls remove_file on it.
# NOTE(review): the effect of `url: true` is defined by relaton-index - confirm there.
Relaton::Index.find_or_create(:jis, url: true, file: DataFetcher::INDEX_FILE).remove_file
|
|
59
|
+
end
|
|
40
60
|
end
|
|
41
61
|
end
|
data/lib/relaton_jis/scraper.rb
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
|
+
# encoding: UTF-8
|
|
2
|
+
|
|
1
3
|
module RelatonJis
|
|
2
4
|
class Scraper
|
|
3
5
|
ATTRS = %i[
|
|
4
|
-
|
|
6
|
+
title link abstract docid docnumber date type language script
|
|
5
7
|
docstatus doctype ics contributor editorialgroup structuredidentifier
|
|
6
8
|
].freeze
|
|
7
9
|
|
|
@@ -24,9 +26,9 @@ module RelatonJis
|
|
|
24
26
|
BibliographicItem.new(**attrs)
|
|
25
27
|
end
|
|
26
28
|
|
|
27
|
-
def fetch_fetched
|
|
28
|
-
|
|
29
|
-
end
|
|
29
|
+
# def fetch_fetched
|
|
30
|
+
# Date.today.to_s
|
|
31
|
+
# end
|
|
30
32
|
|
|
31
33
|
def fetch_title
|
|
32
34
|
{ "ja" => "Jpan", "en" => "Lant" }.map.with_index do |(lang, script), i|
|
data/lib/relaton_jis/version.rb
CHANGED
data/lib/relaton_jis.rb
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require "mechanize"
|
|
4
4
|
require "relaton_iso_bib"
|
|
5
|
+
require "relaton/index"
|
|
5
6
|
require_relative "relaton_jis/version"
|
|
6
7
|
require_relative "relaton_jis/util"
|
|
7
8
|
require_relative "relaton_jis/document_type"
|
|
@@ -12,6 +13,7 @@ require_relative "relaton_jis/scraper"
|
|
|
12
13
|
require_relative "relaton_jis/bibliography"
|
|
13
14
|
require_relative "relaton_jis/hit_collection"
|
|
14
15
|
require_relative "relaton_jis/hit"
|
|
16
|
+
require_relative "relaton_jis/data_fetcher"
|
|
15
17
|
|
|
16
18
|
module RelatonJis
|
|
17
19
|
class Error < StandardError; end
|
data/relaton_jis.gemspec
CHANGED
|
@@ -35,6 +35,7 @@ Gem::Specification.new do |spec|
|
|
|
35
35
|
|
|
36
36
|
# Uncomment to register a new dependency of your gem
|
|
37
37
|
spec.add_dependency "mechanize", "~> 2.10"
|
|
38
|
+
spec.add_dependency "relaton-index", "~> 0.2.0"
|
|
38
39
|
spec.add_dependency "relaton-iso-bib", "~> 1.19.0"
|
|
39
40
|
|
|
40
41
|
# For more information and examples about making a new gem, check out our
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: relaton-jis
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.19.
|
|
4
|
+
version: 1.19.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2024-
|
|
11
|
+
date: 2024-09-27 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: mechanize
|
|
@@ -24,6 +24,20 @@ dependencies:
|
|
|
24
24
|
- - "~>"
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
26
|
version: '2.10'
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: relaton-index
|
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - "~>"
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: 0.2.0
|
|
34
|
+
type: :runtime
|
|
35
|
+
prerelease: false
|
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
+
requirements:
|
|
38
|
+
- - "~>"
|
|
39
|
+
- !ruby/object:Gem::Version
|
|
40
|
+
version: 0.2.0
|
|
27
41
|
- !ruby/object:Gem::Dependency
|
|
28
42
|
name: relaton-iso-bib
|
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -60,6 +74,7 @@ files:
|
|
|
60
74
|
- lib/relaton_jis.rb
|
|
61
75
|
- lib/relaton_jis/bibliographic_item.rb
|
|
62
76
|
- lib/relaton_jis/bibliography.rb
|
|
77
|
+
- lib/relaton_jis/data_fetcher.rb
|
|
63
78
|
- lib/relaton_jis/document_type.rb
|
|
64
79
|
- lib/relaton_jis/hash_converter.rb
|
|
65
80
|
- lib/relaton_jis/hit.rb
|