relaton-jis 1.19.1 → 1.19.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c223759d15403f399f92edd2272b156b0e3c7f6c7dc274babb6b24cf02f99326
4
- data.tar.gz: 387192eb0a06f3f0cca389decf2e0aa9d13193aed0f52d0e8949c7089ce77b99
3
+ metadata.gz: 8b816ec16d7873fe9c8379b12dbe5f48f2874a50a5e113fdb439abaa6eb0a081
4
+ data.tar.gz: 94d2b6a1be560ec226f95da9eb6c8973af93fa983eb86ceb5cbaba8b72c3fbc3
5
5
  SHA512:
6
- metadata.gz: a1ceb478a730fba7ce00809420161cc996b1ade2e3d590fab98c75668ee1450df0be2daebf1f5e3cedd8fe099121bd5e2ed01d73b56b5b755afee12e2a009302
7
- data.tar.gz: 1c7fe1151443babf282595293d6c41f5945fe75d56481f06edd60a5c00f61b15f9ff65a428de6827494f08df3a64f5eb620a6fdb2f1832f826d4de346dee28f7
6
+ metadata.gz: f255822a0bc60c34d5b6d1074dd4b280b7626d64084710fe78a19495b8d1f6a9fe55ba2afa396ad7227024d6207c4de63ee904b8e395276e3eadbd626d9e1e93
7
+ data.tar.gz: f8b91c43a6f9d2a8a84da5f4812a135c841e0aa909938c30a6c6d1e885b338dfb83e95303d42fbd189918606d93154393f2d521ed5eea7d1cbb2adaf79f7b395
data/README.adoc CHANGED
@@ -149,6 +149,24 @@ item.link
149
149
  @type="pdf">]
150
150
  ----
151
151
 
152
+ === Fetch data
153
+
154
+ This gem scrapes the https://webdesk.jsa.or.jp/books/W11M0270 pages to fetch the JIS Standards metadata. By default, the data is saved in the `./data` folder in YAML format.
155
+
156
+ The method `RelatonJis::DataFetcher.fetch(output: "data", format: "yaml")` fetches all the documents from the dataset and saves them to the `./data` folder in YAML format.
157
+ Arguments:
158
+
159
+ - `output` - folder to save documents (default `./data`).
160
+ - `format` - the format in which the documents are saved. Possible formats are: `yaml`, `xml`, `bibxml` (default `yaml`).
161
+
162
+ [source,ruby]
163
+ ----
164
+ RelatonJis::DataFetcher.fetch
165
+ Start fetching JIS data at 2024-09-27 17:49:40 -0400
166
+ Fetching JIS data finished at 2024-09-27 18:40:11 -0400. It took 3031.0 seconds.
167
+ => nil
168
+ ----
169
+
152
170
  === Logging
153
171
 
154
172
  RelatonJis uses the relaton-logger gem for logging. By default, it logs to STDOUT. To change the log levels and add other loggers, read the https://github.com/relaton/relaton-logger#usage[relaton-logger] documentation.
@@ -2,7 +2,8 @@ module RelatonJis
2
2
  module Bibliography
3
3
  extend self
4
4
 
5
- SOURCE = "https://webdesk.jsa.or.jp/books/W11M".freeze
5
+ # SOURCE = "https://webdesk.jsa.or.jp/books/W11M".freeze
6
+ GH_URL = "https://raw.githubusercontent.com/relaton/relaton-data-jis/refs/heads/main/".freeze
6
7
 
7
8
  #
8
9
  # Search JIS by keyword
@@ -13,14 +14,9 @@ module RelatonJis
13
14
  # @return [RelatonJis::HitCollection] search result
14
15
  #
15
16
  def search(code, year = nil)
16
- agent = Mechanize.new
17
- resp = agent.post "#{SOURCE}0270/index", dantai: "JIS", bunsyo_id: code, searchtype2: "1", status_1: "1", status_2: "1"
18
- disp = JSON.parse resp.body
19
- # raise RelatonBib::RequestError, "No results found for #{code}" if disp["disp_screen"].nil?
20
- return unless disp["status"]
21
-
22
- result = agent.get "#{SOURCE}0070/index"
23
- HitCollection.new code, year, result: result.xpath("//div[@class='blockGenaral']")
17
+ index = Relaton::Index.find_or_create(:jis, url: "#{GH_URL}index-v1.zip", file: DataFetcher::INDEX_FILE)
18
+ result = index.search(code).sort_by { |h| h[:id] }
19
+ HitCollection.new code, year, result: result # .xpath("//div[@class='blockGenaral']")
24
20
  end
25
21
 
26
22
  #
@@ -0,0 +1,156 @@
1
+ module RelatonJis
2
+ class DataFetcher
3
+ URL = "https://webdesk.jsa.or.jp/books/".freeze
4
+ INDEX_FILE = "index-v1.yaml".freeze
5
+
6
+ def initialize(output, format)
7
+ @output = output
8
+ @format = format
9
+ @ext = format.sub("bibxml", "xml")
10
+ @files = Set.new
11
+ @queue = SizedQueue.new 10
12
+ @threads = create_thread_pool 5
13
+ @mutex = Mutex.new
14
+ end
15
+
16
+ def self.fetch(output: "data", format: "yaml")
17
+ start_time = Time.now
18
+ puts "Start fetching JIS data at #{start_time}"
19
+ FileUtils.mkdir_p output
20
+ new(output, format).fetch
21
+ stop_time = Time.now
22
+ puts "Fetching JIS data finished at #{stop_time}. It took #{stop_time - start_time} seconds."
23
+ end
24
+
25
+ def create_thread_pool(size)
26
+ Array.new(size) do
27
+ Thread.new do
28
+ until (url = @queue.shift) == :END
29
+ fetch_doc url
30
+ end
31
+ end
32
+ end
33
+ end
34
+
35
+ def fetch_doc(url) # rubocop:disable Metrics/MethodLength
36
+ attempts = 0
37
+ begin
38
+ bib = Scraper.new(url).fetch
39
+ rescue StandardError => e
40
+ attempts += 1
41
+ if attempts < 5
42
+ sleep 2
43
+ retry
44
+ else
45
+ Util.warn "URL: #{url}"
46
+ Util.warn "#{e.message}\n#{e.backtrace[0..6].join("\n")}"
47
+ end
48
+ else
49
+ save_doc bib, url
50
+ end
51
+ end
52
+
53
+ def fetch
54
+ return unless initial_post
55
+
56
+ resp = agent.get "#{URL}W11M0070/index"
57
+ parse_page resp
58
+ index.save
59
+ end
60
+
61
+ def initial_post
62
+ return true if @initial_time && Time.now - @initial_time < 600
63
+
64
+ body = { record: 0, dantai: "JIS", searchtype2: 1, status_1: 1, status_2: 2 }
65
+ resp = agent.post "#{URL}W11M0270/index", body
66
+ disp = JSON.parse resp.body
67
+ @initial_time = Time.now
68
+ disp["status"] || Util.warn("No results found for JIS")
69
+ end
70
+
71
+ def agent
72
+ @agent ||= Mechanize.new
73
+ end
74
+
75
+ def parse_page(resp)
76
+ while resp
77
+ resp.xpath('//div[@class="blockGenaral"]/a').each { |a| @queue << a[:href] }
78
+ offset = parse_offset resp
79
+ break if offset >= count # no more pages
80
+
81
+ resp = get_next_page(offset)
82
+ end
83
+ end_threads_and_wait
84
+ end
85
+
86
+ def parse_offset(resp) # rubocop:disable Metrics/AbcSize
87
+ if resp.at('//*[@id="btnPaging"]') # first page
88
+ @count = resp.at('//script[contains(.,"var count =")]').text.match(/var count = (\d+);/)[1]
89
+ resp.at("//*[@id='offset']")[:value].to_i
90
+ else
91
+ script = resp.at("//script").text
92
+ script.match(/\("offset"\)\.value = '(\d+)'/)[1].to_i
93
+ end
94
+ end
95
+
96
+ def end_threads_and_wait
97
+ @threads.size.times { @queue << :END }
98
+ @queue.close
99
+ @threads.each(&:join)
100
+ end
101
+
102
+ def count
103
+ @count.to_i
104
+ end
105
+
106
+ def get_next_page(offset) # rubocop:disable Metrics/MethodLength
107
+ attempts = 0
108
+ begin
109
+ if initial_post
110
+ agent.post "#{URL}W11M0070/getAddList", search_type: "JIS", offset: offset
111
+ end
112
+ rescue StandardError => e
113
+ attempts += 1
114
+ if attempts < 5
115
+ sleep 2
116
+ retry
117
+ else
118
+ Util.warn "#{e.message}\n#{e.backtrace[0..6].join("\n")}"
119
+ end
120
+ end
121
+ end
122
+
123
+ def save_doc(bib, url) # rubocop:disable Metrics/MethodLength
124
+ return unless bib
125
+
126
+ id = bib.docidentifier.find(&:primary).id
127
+ file = file id
128
+ @mutex.synchronize do
129
+ if @files.include?(file)
130
+ Util.warn "File #{file} already exists. Duplication URL: #{url}"
131
+ else
132
+ @files << file
133
+ File.write file, serialize(bib), encoding: "UTF-8"
134
+ index.add_or_update id, file
135
+ end
136
+ end
137
+ end
138
+
139
+ def index
140
+ @index ||= Relaton::Index.find_or_create :jis, file: INDEX_FILE
141
+ end
142
+
143
+ def file(id)
144
+ name = id.gsub(/[:\/\s]/, "_")
145
+ File.join @output, "#{name}.#{@ext}"
146
+ end
147
+
148
+ def serialize(bib)
149
+ case @format
150
+ when "yaml" then bib.to_hash.to_yaml
151
+ when "xml" then bib.to_xml bibdata: true
152
+ else bib.send "to_#{@format}"
153
+ end
154
+ end
155
+ end
156
+ end
@@ -1,6 +1,6 @@
1
1
  module RelatonJis
2
2
  module HashConverter
3
- include RelatonBib::HashConverter
3
+ include RelatonIsoBib::HashConverter
4
4
  extend self
5
5
 
6
6
  # @param item_hash [Hash]
@@ -8,9 +8,9 @@ module RelatonJis
8
8
  #
9
9
  # @return [RelatonJis::Hit] new hit
10
10
  #
11
- def self.create(node, collection)
12
- a = node.at("./a")
13
- hit = { id: a.at("./text()").text.strip, url: a["href"] }
11
+ def self.create(hit, collection)
12
+ # a = node.at("./a")
13
+ # hit = { id: a.at("./text()").text.strip, url: a["href"] }
14
14
  new hit, collection
15
15
  end
16
16
 
@@ -44,7 +44,14 @@ module RelatonJis
44
44
  end
45
45
 
46
46
  def fetch
47
- @fetch ||= Scraper.new(hit[:url]).fetch
47
+ return @fetch if defined? @fetch
48
+
49
+ # @fetch = Scraper.new(hit[:url]).fetch
50
+ yaml = Mechanize.new.get("#{Bibliography::GH_URL}#{hit[:file]}").body
51
+ hash = YAML.safe_load yaml
52
+ @fetch = RelatonJis::BibliographicItem.from_hash hash
53
+ @fetch.fetched = Date.today.to_s
54
+ @fetch
48
55
  end
49
56
  end
50
57
  end
@@ -9,6 +9,7 @@ module RelatonJis
9
9
  @prefix = "JIS"
10
10
  @defaultprefix = %r{^(JIS|TR)\s}
11
11
  @idtype = "JIS"
12
+ @datasets = %w[jis-webdesk]
12
13
  end
13
14
 
14
15
  # @param code [String]
@@ -19,6 +20,18 @@ module RelatonJis
19
20
  ::RelatonJis::Bibliography.get(code, date, opts)
20
21
  end
21
22
 
23
+ #
24
+ # Fetch all the documents from a source
25
+ #
26
+ # @param [String] _source source name
27
+ # @param [Hash] opts
28
+ # @option opts [String] :output directory to output documents
29
+ # @option opts [String] :format
30
+ #
31
+ def fetch_data(_source, opts)
32
+ DataFetcher.fetch(**opts)
33
+ end
34
+
22
35
  # @param xml [String]
23
36
  # @return [RelatonJis::BibliographicItem]
24
37
  def from_xml(xml)
@@ -37,5 +50,12 @@ module RelatonJis
37
50
  def grammar_hash
38
51
  @grammar_hash ||= ::RelatonJis.grammar_hash
39
52
  end
53
+
54
+ #
55
+ # Remove index file
56
+ #
57
+ def remove_index_file
58
+ Relaton::Index.find_or_create(:jis, url: true, file: DataFetcher::INDEX_FILE).remove_file
59
+ end
40
60
  end
41
61
  end
@@ -1,7 +1,9 @@
1
+ # encoding: UTF-8
2
+
1
3
  module RelatonJis
2
4
  class Scraper
3
5
  ATTRS = %i[
4
- fetched title link abstract docid docnumber date type language script
6
+ title link abstract docid docnumber date type language script
5
7
  docstatus doctype ics contributor editorialgroup structuredidentifier
6
8
  ].freeze
7
9
 
@@ -24,9 +26,9 @@ module RelatonJis
24
26
  BibliographicItem.new(**attrs)
25
27
  end
26
28
 
27
- def fetch_fetched
28
- Date.today.to_s
29
- end
29
+ # def fetch_fetched
30
+ # Date.today.to_s
31
+ # end
30
32
 
31
33
  def fetch_title
32
34
  { "ja" => "Jpan", "en" => "Lant" }.map.with_index do |(lang, script), i|
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RelatonJis
4
- VERSION = "1.19.1"
4
+ VERSION = "1.19.2"
5
5
  end
data/lib/relaton_jis.rb CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  require "mechanize"
4
4
  require "relaton_iso_bib"
5
+ require "relaton/index"
5
6
  require_relative "relaton_jis/version"
6
7
  require_relative "relaton_jis/util"
7
8
  require_relative "relaton_jis/document_type"
@@ -12,6 +13,7 @@ require_relative "relaton_jis/scraper"
12
13
  require_relative "relaton_jis/bibliography"
13
14
  require_relative "relaton_jis/hit_collection"
14
15
  require_relative "relaton_jis/hit"
16
+ require_relative "relaton_jis/data_fetcher"
15
17
 
16
18
  module RelatonJis
17
19
  class Error < StandardError; end
data/relaton_jis.gemspec CHANGED
@@ -35,6 +35,7 @@ Gem::Specification.new do |spec|
35
35
 
36
36
  # Uncomment to register a new dependency of your gem
37
37
  spec.add_dependency "mechanize", "~> 2.10"
38
+ spec.add_dependency "relaton-index", "~> 0.2.0"
38
39
  spec.add_dependency "relaton-iso-bib", "~> 1.19.0"
39
40
 
40
41
  # For more information and examples about making a new gem, check out our
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-jis
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.19.1
4
+ version: 1.19.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-07-26 00:00:00.000000000 Z
11
+ date: 2024-09-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '2.10'
27
+ - !ruby/object:Gem::Dependency
28
+ name: relaton-index
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 0.2.0
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 0.2.0
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: relaton-iso-bib
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -60,6 +74,7 @@ files:
60
74
  - lib/relaton_jis.rb
61
75
  - lib/relaton_jis/bibliographic_item.rb
62
76
  - lib/relaton_jis/bibliography.rb
77
+ - lib/relaton_jis/data_fetcher.rb
63
78
  - lib/relaton_jis/document_type.rb
64
79
  - lib/relaton_jis/hash_converter.rb
65
80
  - lib/relaton_jis/hit.rb