relaton-jis 1.19.1 → 1.19.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c223759d15403f399f92edd2272b156b0e3c7f6c7dc274babb6b24cf02f99326
4
- data.tar.gz: 387192eb0a06f3f0cca389decf2e0aa9d13193aed0f52d0e8949c7089ce77b99
3
+ metadata.gz: 8b816ec16d7873fe9c8379b12dbe5f48f2874a50a5e113fdb439abaa6eb0a081
4
+ data.tar.gz: 94d2b6a1be560ec226f95da9eb6c8973af93fa983eb86ceb5cbaba8b72c3fbc3
5
5
  SHA512:
6
- metadata.gz: a1ceb478a730fba7ce00809420161cc996b1ade2e3d590fab98c75668ee1450df0be2daebf1f5e3cedd8fe099121bd5e2ed01d73b56b5b755afee12e2a009302
7
- data.tar.gz: 1c7fe1151443babf282595293d6c41f5945fe75d56481f06edd60a5c00f61b15f9ff65a428de6827494f08df3a64f5eb620a6fdb2f1832f826d4de346dee28f7
6
+ metadata.gz: f255822a0bc60c34d5b6d1074dd4b280b7626d64084710fe78a19495b8d1f6a9fe55ba2afa396ad7227024d6207c4de63ee904b8e395276e3eadbd626d9e1e93
7
+ data.tar.gz: f8b91c43a6f9d2a8a84da5f4812a135c841e0aa909938c30a6c6d1e885b338dfb83e95303d42fbd189918606d93154393f2d521ed5eea7d1cbb2adaf79f7b395
data/README.adoc CHANGED
@@ -149,6 +149,24 @@ item.link
149
149
  @type="pdf">]
150
150
  ----
151
151
 
152
+ === Fetch data
153
+
154
+ This gem scrapes the pages at https://webdesk.jsa.or.jp/books/W11M0270 to fetch JIS Standards metadata. By default, the data is saved in the `./data` folder in YAML format.
155
+
156
+ The method `RelatonJis::DataFetcher.fetch(output: "data", format: "yaml")` fetches all the documents from the dataset and saves them to the `./data` folder in YAML format.
157
+ Arguments:
158
+
159
+ - `output` - folder to save documents (default `./data`).
160
+ - `format` - the format in which the documents are saved. Possible formats are: `yaml`, `xml`, `bibxml` (default `yaml`).
161
+
162
+ [source,ruby]
163
+ ----
164
+ RelatonJis::DataFetcher.fetch
165
+ Start fetching JIS data at 2024-09-27 17:49:40 -0400
166
+ Fetching JIS data finished at 2024-09-27 18:40:11 -0400. It took 3031.0 seconds.
167
+ => nil
168
+ ----
169
+
152
170
  === Logging
153
171
 
154
172
  RelatonJis uses the relaton-logger gem for logging. By default, it logs to STDOUT. To change the log levels and add other loggers, read the https://github.com/relaton/relaton-logger#usage[relaton-logger] documentation.
@@ -2,7 +2,8 @@ module RelatonJis
2
2
  module Bibliography
3
3
  extend self
4
4
 
5
- SOURCE = "https://webdesk.jsa.or.jp/books/W11M".freeze
5
+ # SOURCE = "https://webdesk.jsa.or.jp/books/W11M".freeze
6
+ GH_URL = "https://raw.githubusercontent.com/relaton/relaton-data-jis/refs/heads/main/".freeze
6
7
 
7
8
  #
8
9
  # Search JIS by keyword
@@ -13,14 +14,9 @@ module RelatonJis
13
14
  # @return [RelatonJis::HitCollection] search result
14
15
  #
15
16
  def search(code, year = nil)
16
- agent = Mechanize.new
17
- resp = agent.post "#{SOURCE}0270/index", dantai: "JIS", bunsyo_id: code, searchtype2: "1", status_1: "1", status_2: "1"
18
- disp = JSON.parse resp.body
19
- # raise RelatonBib::RequestError, "No results found for #{code}" if disp["disp_screen"].nil?
20
- return unless disp["status"]
21
-
22
- result = agent.get "#{SOURCE}0070/index"
23
- HitCollection.new code, year, result: result.xpath("//div[@class='blockGenaral']")
17
+ index = Relaton::Index.find_or_create(:jis, url: "#{GH_URL}index-v1.zip", file: DataFetcher::INDEX_FILE)
18
+ result = index.search(code).sort_by { |h| h[:id] }
19
+ HitCollection.new code, year, result: result # .xpath("//div[@class='blockGenaral']")
24
20
  end
25
21
 
26
22
  #
@@ -0,0 +1,156 @@
1
+ module RelatonJis
2
+ class DataFetcher
3
+ URL = "https://webdesk.jsa.or.jp/books/".freeze
4
+ INDEX_FILE = "index-v1.yaml".freeze
5
+
6
+ def initialize(output, format)
7
+ @output = output
8
+ @format = format
9
+ @ext = format.sub("bibxml", "xml")
10
+ @files = Set.new
11
+ @queue = SizedQueue.new 10
12
+ @threads = create_thread_pool 5
13
+ @mutex = Mutex.new
14
+ end
15
+
16
+ def self.fetch(output: "data", format: "yaml")
17
+ start_time = Time.now
18
+ puts "Start fetching JIS data at #{start_time}"
19
+ FileUtils.mkdir_p output
20
+ new(output, format).fetch
21
+ stop_time = Time.now
22
+ puts "Fetching JIS data finished at #{stop_time}. It took #{stop_time - start_time} seconds."
23
+ end
24
+
25
+ def create_thread_pool(size)
26
+ Array.new(size) do
27
+ Thread.new do
28
+ until (url = @queue.shift) == :END
29
+ fetch_doc url
30
+ end
31
+ end
32
+ end
33
+ end
34
+
35
+ def fetch_doc(url) # rubocop:disable Metrics/MethodLength
36
+ attempts = 0
37
+ begin
38
+ bib = Scraper.new(url).fetch
39
+ rescue StandardError => e
40
+ attempts += 1
41
+ if attempts < 5
42
+ sleep 2
43
+ retry
44
+ else
45
+ Util.warn "URL: #{url}"
46
+ Util.warn "#{e.message}\n#{e.backtrace[0..6].join("\n")}"
47
+ end
48
+ else
49
+ save_doc bib, url
50
+ end
51
+ end
52
+
53
+ def fetch
54
+ return unless initial_post
55
+
56
+ resp = agent.get "#{URL}W11M0070/index"
57
+ parse_page resp
58
+ index.save
59
+ end
60
+
61
+ def initial_post
62
+ return true if @initial_time && Time.now - @initial_time < 600
63
+
64
+ body = { record: 0, dantai: "JIS", searchtype2: 1, status_1: 1, status_2: 2 }
65
+ resp = agent.post "#{URL}W11M0270/index", body
66
+ disp = JSON.parse resp.body
67
+ @initial_time = Time.now
68
+ disp["status"] || Util.warn("No results found for JIS")
69
+ end
70
+
71
+ def agent
72
+ @agent ||= Mechanize.new
73
+ end
74
+
75
+ def parse_page(resp)
76
+ while resp
77
+ resp.xpath('//div[@class="blockGenaral"]/a').each { |a| @queue << a[:href] }
78
+ offset = parse_offset resp
79
+ break if offset >= count # no more pages
80
+
81
+ resp = get_next_page(offset)
82
+ end
83
+ end_threads_and_wait
84
+ end
85
+
86
+ def parse_offset(resp) # rubocop:disable Metrics/AbcSize
87
+ if resp.at('//*[@id="btnPaging"]') # first page
88
+ @count = resp.at('//script[contains(.,"var count =")]').text.match(/var count = (\d+);/)[1]
89
+ resp.at("//*[@id='offset']")[:value].to_i
90
+ else
91
+ script = resp.at("//script").text
92
+ script.match(/\("offset"\)\.value = '(\d+)'/)[1].to_i
93
+ end
94
+ end
95
+
96
+ def end_threads_and_wait
97
+ @threads.size.times { @queue << :END }
98
+ @queue.close
99
+ @threads.each(&:join)
100
+ end
101
+
102
+ def count
103
+ @count.to_i
104
+ end
105
+
106
+ def get_next_page(offset) # rubocop:disable Metrics/MethodLength
107
+ attempts = 0
108
+ begin
109
+ if initial_post
110
+ agent.post "#{URL}W11M0070/getAddList", search_type: "JIS", offset: offset
111
+ end
112
+ rescue StandardError => e
113
+ attempts += 1
114
+ if attempts < 5
115
+ sleep 2
116
+ retry
117
+ else
118
+ Util.warn "#{e.message}\n#{e.backtrace[0..6].join("\n")}"
119
+ end
120
+ end
121
+ end
122
+
123
+ def save_doc(bib, url) # rubocop:disable Metrics/MethodLength
124
+ return unless bib
125
+
126
+ id = bib.docidentifier.find(&:primary).id
127
+ file = file id
128
+ @mutex.synchronize do
129
+ if @files.include?(file)
130
+ Util.warn "File #{file} already exists. Duplication URL: #{url}"
131
+ else
132
+ @files << file
133
+ File.write file, serialize(bib), encoding: "UTF-8"
134
+ index.add_or_update id, file
135
+ end
136
+ end
137
+ end
138
+
139
+ def index
140
+ @index ||= Relaton::Index.find_or_create :jis, file: INDEX_FILE
141
+ end
142
+
143
+ def file(id)
144
+ name = id.gsub(/[:\/\s]/, "_")
145
+ File.join @output, "#{name}.#{@ext}"
146
+ end
147
+
148
+ def serialize(bib)
149
+ case @format
150
+ when "yaml" then bib.to_hash.to_yaml
151
+ when "xml" then bib.to_xml bibdata: true
152
+ else bib.send "to_#{@format}"
153
+ end
154
+ end
155
+ end
156
+ end
@@ -1,6 +1,6 @@
1
1
  module RelatonJis
2
2
  module HashConverter
3
- include RelatonBib::HashConverter
3
+ include RelatonIsoBib::HashConverter
4
4
  extend self
5
5
 
6
6
  # @param item_hash [Hash]
@@ -8,9 +8,9 @@ module RelatonJis
8
8
  #
9
9
  # @return [RelatonJis::Hit] new hit
10
10
  #
11
- def self.create(node, collection)
12
- a = node.at("./a")
13
- hit = { id: a.at("./text()").text.strip, url: a["href"] }
11
+ def self.create(hit, collection)
12
+ # a = node.at("./a")
13
+ # hit = { id: a.at("./text()").text.strip, url: a["href"] }
14
14
  new hit, collection
15
15
  end
16
16
 
@@ -44,7 +44,14 @@ module RelatonJis
44
44
  end
45
45
 
46
46
  def fetch
47
- @fetch ||= Scraper.new(hit[:url]).fetch
47
+ return @fetch if defined? @fetch
48
+
49
+ # @fetch = Scraper.new(hit[:url]).fetch
50
+ yaml = Mechanize.new.get("#{Bibliography::GH_URL}#{hit[:file]}").body
51
+ hash = YAML.safe_load yaml
52
+ @fetch = RelatonJis::BibliographicItem.from_hash hash
53
+ @fetch.fetched = Date.today.to_s
54
+ @fetch
48
55
  end
49
56
  end
50
57
  end
@@ -9,6 +9,7 @@ module RelatonJis
9
9
  @prefix = "JIS"
10
10
  @defaultprefix = %r{^(JIS|TR)\s}
11
11
  @idtype = "JIS"
12
+ @datasets = %w[jis-webdesk]
12
13
  end
13
14
 
14
15
  # @param code [String]
@@ -19,6 +20,18 @@ module RelatonJis
19
20
  ::RelatonJis::Bibliography.get(code, date, opts)
20
21
  end
21
22
 
23
+ #
24
+ # Fetch all the documents from a source
25
+ #
26
+ # @param [String] _source source name
27
+ # @param [Hash] opts
28
+ # @option opts [String] :output directory to output documents
29
+ # @option opts [String] :format format of the saved documents (`yaml`, `xml`, or `bibxml`)
30
+ #
31
+ def fetch_data(_source, opts)
32
+ DataFetcher.fetch(**opts)
33
+ end
34
+
22
35
  # @param xml [String]
23
36
  # @return [RelatonJis::BibliographicItem]
24
37
  def from_xml(xml)
@@ -37,5 +50,12 @@ module RelatonJis
37
50
  def grammar_hash
38
51
  @grammar_hash ||= ::RelatonJis.grammar_hash
39
52
  end
53
+
54
+ #
55
+ # Remove index file
56
+ #
57
+ def remove_index_file
58
+ Relaton::Index.find_or_create(:jis, url: true, file: DataFetcher::INDEX_FILE).remove_file
59
+ end
40
60
  end
41
61
  end
@@ -1,7 +1,9 @@
1
+ # encoding: UTF-8
2
+
1
3
  module RelatonJis
2
4
  class Scraper
3
5
  ATTRS = %i[
4
- fetched title link abstract docid docnumber date type language script
6
+ title link abstract docid docnumber date type language script
5
7
  docstatus doctype ics contributor editorialgroup structuredidentifier
6
8
  ].freeze
7
9
 
@@ -24,9 +26,9 @@ module RelatonJis
24
26
  BibliographicItem.new(**attrs)
25
27
  end
26
28
 
27
- def fetch_fetched
28
- Date.today.to_s
29
- end
29
+ # def fetch_fetched
30
+ # Date.today.to_s
31
+ # end
30
32
 
31
33
  def fetch_title
32
34
  { "ja" => "Jpan", "en" => "Lant" }.map.with_index do |(lang, script), i|
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RelatonJis
4
- VERSION = "1.19.1"
4
+ VERSION = "1.19.2"
5
5
  end
data/lib/relaton_jis.rb CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  require "mechanize"
4
4
  require "relaton_iso_bib"
5
+ require "relaton/index"
5
6
  require_relative "relaton_jis/version"
6
7
  require_relative "relaton_jis/util"
7
8
  require_relative "relaton_jis/document_type"
@@ -12,6 +13,7 @@ require_relative "relaton_jis/scraper"
12
13
  require_relative "relaton_jis/bibliography"
13
14
  require_relative "relaton_jis/hit_collection"
14
15
  require_relative "relaton_jis/hit"
16
+ require_relative "relaton_jis/data_fetcher"
15
17
 
16
18
  module RelatonJis
17
19
  class Error < StandardError; end
data/relaton_jis.gemspec CHANGED
@@ -35,6 +35,7 @@ Gem::Specification.new do |spec|
35
35
 
36
36
  # Uncomment to register a new dependency of your gem
37
37
  spec.add_dependency "mechanize", "~> 2.10"
38
+ spec.add_dependency "relaton-index", "~> 0.2.0"
38
39
  spec.add_dependency "relaton-iso-bib", "~> 1.19.0"
39
40
 
40
41
  # For more information and examples about making a new gem, check out our
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-jis
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.19.1
4
+ version: 1.19.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-07-26 00:00:00.000000000 Z
11
+ date: 2024-09-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '2.10'
27
+ - !ruby/object:Gem::Dependency
28
+ name: relaton-index
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 0.2.0
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 0.2.0
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: relaton-iso-bib
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -60,6 +74,7 @@ files:
60
74
  - lib/relaton_jis.rb
61
75
  - lib/relaton_jis/bibliographic_item.rb
62
76
  - lib/relaton_jis/bibliography.rb
77
+ - lib/relaton_jis/data_fetcher.rb
63
78
  - lib/relaton_jis/document_type.rb
64
79
  - lib/relaton_jis/hash_converter.rb
65
80
  - lib/relaton_jis/hit.rb