relaton-plateau 1.19.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,11 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
<!--
  Top-level RELAX NG grammar for Plateau bibliographic records.
  It pulls in the definitions of basicdoc.rng and relaton-plateau.rng and
  declares that a valid document root is either a bibitem or a bibdata pattern.
  Both patterns are referenced here but not defined in this file; they are
  presumably supplied by the included grammars (confirm against those files).
-->
2
+ <grammar xmlns="http://relaxng.org/ns/structure/1.0">
3
+ <include href="basicdoc.rng"/>
4
+ <include href="relaton-plateau.rng"/>
5
+ <start>
6
+ <choice>
7
+ <ref name="bibitem"/>
8
+ <ref name="bibdata"/>
9
+ </choice>
10
+ </start>
11
+ </grammar>
@@ -0,0 +1,127 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
<!--
  RELAX NG grammar of the Plateau flavor.
  It includes biblio-standoc.rng and, inside the include element, redefines
  BibDataExtensionType, DocumentType and editorialgroup. The remaining
  definitions below the include are added alongside the included grammar.
  Patterns referenced here but not defined in this file (doctype, docsubtype,
  ics, structuredidentifier, technical-committee, IsoWorkgroup, ISO8601Date,
  image-no-id) are presumably provided by biblio-standoc.rng or the grammars
  it includes (confirm against those files).
-->
2
+ <grammar xmlns="http://relaxng.org/ns/structure/1.0" datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes">
3
+ <include href="biblio-standoc.rng">
<!--
  Content model of the extension block: optional schema-version attribute,
  a required doctype, then optional docsubtype and editorialgroup, any number
  of ics, a required structuredidentifier, and optional stagename, cover and
  filesize, in this order.
-->
4
+ <define name="BibDataExtensionType">
5
+ <optional>
6
+ <attribute name="schema-version"/>
7
+ </optional>
8
+ <ref name="doctype"/>
9
+ <optional>
10
+ <ref name="docsubtype"/>
11
+ </optional>
12
+ <optional>
13
+ <ref name="editorialgroup"/>
14
+ </optional>
15
+ <zeroOrMore>
16
+ <ref name="ics"/>
17
+ </zeroOrMore>
18
+ <ref name="structuredidentifier"/>
19
+ <optional>
20
+ <ref name="stagename"/>
21
+ </optional>
22
+ <optional>
23
+ <ref name="cover"/>
24
+ </optional>
25
+ <optional>
26
+ <ref name="filesize"/>
27
+ </optional>
28
+ </define>
<!-- Allowed document type values: handbook, technical-report, annex. -->
29
+ <define name="DocumentType">
30
+ <choice>
31
+ <value>handbook</value>
32
+ <value>technical-report</value>
33
+ <value>annex</value>
34
+ </choice>
35
+ </define>
<!-- editorialgroup element whose content is the ISOProjectGroup pattern defined below. -->
36
+ <define name="editorialgroup">
37
+ <element name="editorialgroup">
38
+ <ref name="ISOProjectGroup"/>
39
+ </element>
40
+ </define>
41
+ </include>
<!--
  Group content: any number of agency elements, at least one
  technical-committee, any number of subcommittee and workgroup elements,
  and an optional secretariat.
-->
42
+ <define name="ISOProjectGroup">
43
+ <zeroOrMore>
44
+ <ref name="agency"/>
45
+ </zeroOrMore>
46
+ <oneOrMore>
47
+ <ref name="technical-committee"/>
48
+ </oneOrMore>
49
+ <zeroOrMore>
50
+ <ref name="subcommittee"/>
51
+ </zeroOrMore>
52
+ <zeroOrMore>
53
+ <ref name="workgroup"/>
54
+ </zeroOrMore>
55
+ <optional>
56
+ <ref name="secretariat"/>
57
+ </optional>
58
+ </define>
<!-- agency element with free text content. -->
59
+ <define name="agency">
60
+ <element name="agency">
61
+ <text/>
62
+ </element>
63
+ </define>
<!--
  NOTE(review): the pattern is named documentnumber but the element it
  declares is project-number. The pattern is not referenced anywhere in this
  file; confirm whether an included grammar uses it.
  The element takes optional integer attributes part, subpart, amendment and
  corrigendum, an optional origyr attribute typed by ISO8601Date, and text content.
-->
64
+ <define name="documentnumber">
65
+ <element name="project-number">
66
+ <optional>
67
+ <attribute name="part">
68
+ <data type="int"/>
69
+ </attribute>
70
+ </optional>
71
+ <optional>
72
+ <attribute name="subpart">
73
+ <data type="int"/>
74
+ </attribute>
75
+ </optional>
76
+ <optional>
77
+ <attribute name="amendment">
78
+ <data type="int"/>
79
+ </attribute>
80
+ </optional>
81
+ <optional>
82
+ <attribute name="corrigendum">
83
+ <data type="int"/>
84
+ </attribute>
85
+ </optional>
86
+ <optional>
87
+ <attribute name="origyr">
88
+ <ref name="ISO8601Date"/>
89
+ </attribute>
90
+ </optional>
91
+ <text/>
92
+ </element>
93
+ </define>
<!-- subcommittee and workgroup share the IsoWorkgroup content pattern. -->
94
+ <define name="subcommittee">
95
+ <element name="subcommittee">
96
+ <ref name="IsoWorkgroup"/>
97
+ </element>
98
+ </define>
99
+ <define name="workgroup">
100
+ <element name="workgroup">
101
+ <ref name="IsoWorkgroup"/>
102
+ </element>
103
+ </define>
<!-- secretariat element with free text content. -->
104
+ <define name="secretariat">
105
+ <element name="secretariat">
106
+ <text/>
107
+ </element>
108
+ </define>
<!-- stagename element: text content with an optional abbreviation attribute. -->
109
+ <define name="stagename">
110
+ <element name="stagename">
111
+ <optional>
112
+ <attribute name="abbreviation"/>
113
+ </optional>
114
+ <text/>
115
+ </element>
116
+ </define>
<!-- cover element wrapping a single image-no-id pattern. -->
117
+ <define name="cover">
118
+ <element name="cover">
119
+ <ref name="image-no-id"/>
120
+ </element>
121
+ </define>
<!-- filesize element holding an XSD int; the unit is not stated in this grammar. -->
122
+ <define name="filesize">
123
+ <element name="filesize">
124
+ <data type="int"/>
125
+ </element>
126
+ </define>
127
+ </grammar>
@@ -0,0 +1,82 @@
1
+ require_relative "cover"
2
+ require_relative "stagename"
3
+
4
module Relaton
  module Plateau
    #
    # Bibliographic item of the Plateau flavor.
    #
    # Extends RelatonBib::BibliographicItem with three optional attributes
    # (cover, stagename, filesize) and serializes them into the `ext` section
    # of the XML, hash and AsciiBib representations.
    #
    class BibItem < RelatonBib::BibliographicItem
      # @return [Relaton::Plateau::Cover, nil]
      attr_reader :cover

      # @return [Relaton::Plateau::Stagename, nil]
      attr_reader :stagename

      # @return [Integer, nil]
      attr_reader :filesize

      #
      # Remove the flavor-specific keys from the arguments and pass the rest
      # to the parent constructor.
      #
      # @param args [Hash] item attributes
      # @option args [Relaton::Plateau::Cover, nil] :cover cover image wrapper
      # @option args [Integer, nil] :filesize file size
      # @option args [Relaton::Plateau::Stagename, nil] :stagename stage name
      #
      def initialize(**args)
        @cover = args.delete(:cover)
        @filesize = args.delete(:filesize)
        @stagename = args.delete(:stagename)
        super(**args)
      end

      #
      # Fetch flavor schema version
      #
      # @return [String] schema version
      #
      def ext_schema
        # `schema_versions` is not defined in this class; presumably inherited
        # from the parent class.
        @ext_schema ||= schema_versions["relaton-model-plateau"]
      end

      #
      # Render the item as XML. When `bibdata` is requested and the item has
      # any extension data, an `ext` element is appended. Each child of `ext`
      # is emitted only when the corresponding attribute is present.
      #
      # @param opts [Hash]
      # @option opts [Nokogiri::XML::Builder] :builder XML builder
      # @option opts [Boolean] bibdata
      # @option opts [Symbol, nil] :date_format (:short), :full
      # @option opts [String] :lang language
      def to_xml(**opts)
        super do |builder|
          if opts[:bibdata] && has_ext_data?
            ext = builder.ext do |b|
              doctype&.to_xml b
              b.subdoctype subdoctype if subdoctype
              editorialgroup&.to_xml b
              ics&.each { |i| b.ics i }
              structuredidentifier&.to_xml b
              stagename&.to_xml b
              # cover and filesize are optional; the item may have an `ext`
              # section because of the doctype alone.
              cover&.to_xml b
              b.filesize filesize if filesize
            end
            ext["schema-version"] = ext_schema if !opts[:embedded] && ext_schema
          end
        end
      end

      #
      # Render the item as a hash, adding stagename, cover and filesize under
      # the "ext" key when they are set.
      #
      # @return [Hash]
      #
      def to_hash
        hash = super
        return hash unless has_ext_data?

        hash["ext"] ||= {}
        hash["ext"]["stagename"] = stagename.to_hash if stagename
        hash["ext"]["cover"] = cover.to_hash if cover
        hash["ext"]["filesize"] = filesize if filesize
        hash
      end

      #
      # Render the item as AsciiBib, appending stagename, cover and filesize.
      #
      # @param prefix [String] attribute name prefix
      #
      # @return [String]
      #
      def to_asciibib(prefix = "")
        pref = prefix.empty? ? "" : "#{prefix}."
        output = super
        output += stagename.to_asciibib prefix if stagename
        output += cover.to_asciibib prefix if cover
        output += "#{pref}filesize:: #{filesize}\n" if filesize
        output
      end

      private

      # @return [Boolean, Object] truthy when any attribute rendered in `ext` is set
      def has_ext_data?
        doctype || subdoctype || editorialgroup || ics&.any? || structuredidentifier || stagename || cover || filesize
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
module Relaton
  module Plateau
    #
    # Lookup of Plateau documents in the relaton-data-plateau dataset.
    #
    module Bibliography
      extend self

      INDEXFILE = "index-v1"
      GHURL = "https://raw.githubusercontent.com/relaton/relaton-data-plateau/main/"

      #
      # Index of the dataset, obtained from Relaton::Index.
      #
      # @return [Object] the index returned by Relaton::Index.find_or_create
      #
      def index
        Relaton::Index.find_or_create :plateau, url: "#{GHURL}#{INDEXFILE}.zip", file: "#{INDEXFILE}.yaml"
      end

      #
      # Find a document by its reference and log the outcome.
      #
      # @param code [String] document reference
      # @param year [String, nil] accepted for interface compatibility, not used
      # @param opts [Hash] accepted for interface compatibility, not used
      #
      # @return [Relaton::Plateau::BibItem, nil] the item, or nil when not found
      #
      # @raise [RelatonBib::RequestError] when any StandardError occurs during the lookup
      #
      def get(code, year = nil, opts = {})
        Util.info "Fetching ...", key: code
        bib = search(code)
        unless bib
          Util.warn "Not found.", key: code
          # Return nil explicitly so the result does not depend on what the
          # logger call happens to return.
          return nil
        end

        Util.info "Found `#{bib.docidentifier.first.id}`", key: code
        bib
      rescue StandardError => e
        raise RelatonBib::RequestError, e.message
      end

      #
      # Search the index and fetch the matching row with the greatest id.
      #
      # @param code [String] document reference
      #
      # @return [Relaton::Plateau::BibItem, nil]
      #
      def search(code)
        rows = index.search(code)
        return unless rows.any?

        fetch_doc code, **rows.max_by { |row| row[:id] }
      end

      #
      # Download a document file from the dataset and build an item from it.
      #
      # @param code [String] document reference (not used)
      # @param id [String] document id from the index row (not used)
      # @param file [String] path of the document file relative to GHURL
      #
      # @return [Relaton::Plateau::BibItem, nil] nil when the response is not a success
      #
      def fetch_doc(code, id:, file:)
        resp = Net::HTTP.get_response URI("#{GHURL}#{file}")
        return unless resp.is_a? Net::HTTPSuccess

        # NOTE(review): the body comes from the network. Consider
        # YAML.safe_load with an explicit list of permitted classes once the
        # set of types present in the dataset is confirmed.
        hash = YAML.load(resp.body)
        args = HashConverter.hash_to_bib hash
        BibItem.new(**args)
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
module Relaton
  module Plateau
    #
    # Wrapper around the cover image of a document. Every serializer
    # delegates to the wrapped image object.
    #
    class Cover
      # @return [RelatonBib::Image]
      attr_reader :image

      #
      # Create a cover
      #
      # @param [RelatonBib::Image] image image object
      #
      def initialize(image)
        @image = image
      end

      #
      # Render a `cover` element and let the image render itself inside it.
      #
      # @param [Nokogiri::XML::Builder] builder XML builder
      #
      def to_xml(builder)
        builder.cover { |node| image.to_xml(node) }
      end

      #
      # @return [Hash] whatever the image's #to_hash returns
      #
      def to_hash
        image.to_hash
      end

      #
      # Render the image as AsciiBib under the "cover" key, nested below
      # the given prefix when the prefix is not empty.
      #
      # @param [String] prefix attribute name prefix
      #
      # @return [String] whatever the image's #to_asciibib returns
      #
      def to_asciibib(prefix = "")
        key = [prefix, "cover"].reject(&:empty?).join(".")
        image.to_asciibib(key)
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
module Relaton
  module Plateau
    #
    # Document type of the Plateau flavor. An unknown type is reported with
    # a warning; the instance is still created.
    #
    class DocumentType < RelatonBib::DocumentType
      DOCTYPES = %w[handbook technical-report annex].freeze

      #
      # @param type [String] document type
      # @param abbreviation [String, nil] type abbreviation
      #
      def initialize(type:, abbreviation: nil)
        check_type(type)
        super(type: type, abbreviation: abbreviation)
      end

      #
      # Warn when the type is not one of DOCTYPES.
      #
      # @param type [String] document type
      #
      def check_type(type)
        Util.warn "invalid doctype: `#{type}`" unless DOCTYPES.include?(type)
      end
    end
  end
end
@@ -0,0 +1,219 @@
1
+ require "json"
2
+ require_relative "parser"
3
+ require_relative "handbook_parser"
4
+ require_relative "technical_report_parser"
5
+
6
module Relaton
  module Plateau
    # Fetcher class to fetch data from the Plateau website
    class Fetcher
      HANDBOOKS_URL = "https://www.mlit.go.jp/plateau/_next/data/1.3.0/libraries/handbooks.json".freeze
      TECHNICAL_REPORTS_URL = "https://www.mlit.go.jp/plateau/_next/data/1.3.0/libraries/technical-reports.json".freeze

      #
      # @param output [String] directory the documents are written to
      # @param format [String] output format ("yaml", "xml" or "bibxml")
      #
      def initialize(output, format)
        @output = output
        @format = format
        # File extension is the format name without a leading "bib".
        @ext = format.sub(/\Abib/, "")
        @files = []
      end

      #
      # Index the saved documents are registered in (memoized).
      #
      # @return [Object] the index returned by Relaton::Index.find_or_create
      #
      def index
        @index ||= Relaton::Index.find_or_create :plateau, file: "index-v1.yaml"
      end

      #
      # Fetch all documents of a source, write them to the output directory
      # and print the start time, stop time and duration.
      #
      # @param source [String] "plateau-handbooks" or "plateau-technical-reports"
      # @param output [String] output directory, created when missing
      # @param format [String] output format
      #
      def self.fetch(source, output: "data", format: "yaml")
        t1 = Time.now
        puts "Started at: #{t1}"
        FileUtils.mkdir_p output

        case source
        when "plateau-handbooks" then new(output, format).extract_handbooks_data
        when "plateau-technical-reports" then new(output, format).extract_technical_reports_data
        else puts "Invalid source: #{source}"
        end

        t2 = Time.now
        puts "Stopped at: #{t2}"
        puts "Done in: #{(t2 - t1).round} sec."
      end

      #
      # Create a GET request with custom headers to mimic a browser
      #
      # @param uri [URI] request URI
      #
      # @return [Net::HTTP::Get]
      #
      def create_request(uri)
        request = Net::HTTP::Get.new(uri)
        request["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0"
        request["Accept"] = "*/*"
        request["Accept-Language"] = "en-US,en;q=0.5"
        # Advertise only the encodings #handle_response is able to decode.
        request["Accept-Encoding"] = "gzip, deflate"
        request["Referer"] = "https://www.mlit.go.jp/plateau/libraries/"
        request["purpose"] = "prefetch"
        request["x-nextjs-data"] = "1"
        request["Connection"] = "keep-alive"
        request
      end

      #
      # Decode the response body according to its Content-Encoding header.
      # Bodies with any other (or no) encoding are returned unchanged.
      #
      # @param response [Net::HTTPResponse] anything responding to #[] and #body
      #
      # @return [String] decoded body
      #
      def handle_response(response)
        case response["Content-Encoding"]
        when "gzip" then Zlib::GzipReader.new(StringIO.new(response.body)).read
        when "deflate" then Zlib::Inflate.inflate(response.body)
        else response.body
        end
      end
      # Original (misspelled) name, kept for backward compatibility.
      alias hadle_response handle_response

      # Fetch JSON data from a URL with custom headers
      #
      # @param [String] url The URL to fetch JSON data from
      # @return [Hash] The parsed JSON data, or an empty hash on any failure
      def fetch_json_data(url)
        uri = URI(url)
        request = create_request(uri)
        response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: true) do |http|
          http.request(request)
        end

        unless response.code.to_i == 200
          Util.warn "Failed to fetch data: #{response.code} #{response.message}"
          return {}
        end

        JSON.parse(handle_response(response))
      rescue StandardError => e
        # Any network or parsing error is logged and reported as "no data".
        Util.error "Error fetching JSON data from #{url}: #{e.message}"
        {}
      end

      #
      # Extract data for handbooks: one document is saved per handbook
      # version, then the index is saved.
      #
      def extract_handbooks_data
        data = fetch_json_data(HANDBOOKS_URL)
        Util.info "Extracting handbooks data..."
        # fetch_json_data returns {} on failure, so the path may be missing.
        nodes = data.dig("pageProps", "handbooks", "nodes") || []
        nodes.each do |entry|
          handbook = entry["handbook"]

          # The description holds the English title and the abstract
          # separated by "<br />".
          parts = handbook["description"]&.split("<br />") || ["", ""]
          title_en, abstract = parts.first(2).map(&:strip)

          # Slugs containing a hyphen are treated as annexes.
          doctype = entry["slug"].include?("-") ? "annex" : "handbook"

          Array(handbook["versions"]).each do |version|
            item = HandbookParser.new(
              version: version, entry: entry, title_en: title_en, abstract: abstract, doctype: doctype
            ).parse
            save_document(item)
          end
        end
        index.save
      end

      #
      # Extract data for technical reports: one document is saved per entry,
      # then the index is saved.
      #
      def extract_technical_reports_data
        data = fetch_json_data(TECHNICAL_REPORTS_URL)
        Util.info "Extracting technical reports data..."
        # fetch_json_data returns {} on failure, so the path may be missing.
        nodes = data.dig("pageProps", "nodes") || []
        nodes.each { |entry| save_document(TechnicalReportParser.new(entry).parse) }
        index.save
      end

      #
      # Write a document to its file and register it in the index. A document
      # whose file name was already written during this run is skipped with a
      # warning.
      #
      # @param item [Relaton::Plateau::BibItem] document to save
      #
      def save_document(item)
        id = item.docidentifier.first.id
        file = file_name id
        if @files.include?(file)
          Util.warn "File #{file} already exists, skipping.", key: id
        else
          File.write(file, serialize(item))
          @files << file
          index.add_or_update id, file
        end
      end

      #
      # Build the output file path from a document id: whitespace runs become
      # underscores, remaining characters matching \W are dropped, and the
      # result is lower-cased. A "_private" or "_public" suffix is added when
      # the id contains the corresponding Japanese marker.
      #
      # @param id [String] document id
      #
      # @return [String] file path inside the output directory
      #
      def file_name(id)
        name = id.gsub(/\s+/, "_").gsub(/\W+/, "").downcase
        if id.match?(/民間活用編/)
          name += "_private"
        elsif id.match?(/公共活用編/)
          name += "_public"
        end
        File.join(@output, "#{name}.#{@ext}")
      end

      #
      # Serialize a document in the configured format.
      #
      # @param item [Relaton::Plateau::BibItem] document to serialize
      #
      # @return [String]
      #
      def serialize(item)
        case @format
        when "yaml" then item.to_hash.to_yaml
        when "xml" then item.to_xml bibdata: true
        else item.send("to_#{@format}")
        end
      end
    end
  end
end
@@ -0,0 +1,70 @@
1
+ # encoding: UTF-8
2
+
3
module Relaton
  module Plateau
    #
    # Parser of one version of a handbook entry. Extends the generic Parser
    # with the version-specific fields (edition, date, links, file size) and
    # with the English title and abstract taken from the entry description.
    #
    class HandbookParser < Parser
      #
      # @param version [Hash] one element of the handbook's "versions" list
      # @param entry [Hash] handbook entry; its "handbook" value is passed to the parent parser
      # @param title_en [String, nil] English title
      # @param abstract [String, nil] abstract text
      # @param doctype [String] document type
      #
      def initialize(version:, entry:, title_en:, abstract:, doctype:)
        @version = version
        @entry = entry
        super entry["handbook"]
        @title_en = title_en
        @abstract = abstract
        @doctype = doctype
      end

      private

      # @return [String] first whitespace-separated word of the version title
      def edition
        @edition ||= @version["title"].split.first
      end

      # Append the handbook identifier built from the slug and the edition.
      def parse_docid
        super << create_docid("PLATEAU Handbook ##{@entry["slug"]} #{edition}")
      end

      # Append the English title, when given, to the titles of the parent parser.
      def parse_title
        title = super
        title << create_title(@title_en, "en", "Latn") if @title_en
        title
      end

      # Append the abstract, when given, to the abstracts of the parent parser.
      def parse_abstract
        abstr = super
        abstr << create_formatted_string(@abstract) if @abstract
        abstr
      end

      #
      # Build the edition. The number is the first "<digits>.<digits>"
      # sequence of the edition text, so multi-digit versions such as "10.2"
      # are kept whole; it is nil when the text has no such sequence.
      #
      # @return [RelatonBib::Edition]
      #
      def parse_edition
        RelatonBib::Edition.new(content: edition, number: edition[/\d+\.\d+/])
      end

      # @return [Relaton::Plateau::DocumentType]
      def parse_doctype
        DocumentType.new type: @doctype
      end

      # Append the version date; the source uses "." as the date separator.
      def parse_date
        super << create_date(@version["date"].tr(".", "-"))
      end

      # Create a link for each of the "pdf" and "html" keys present in the version.
      def parse_link
        %w[pdf html].select { |type| @version[type] }.map do |type|
          create_link(@version[type], type)
        end
      end

      # @return [Integer] file size of the version (0 when missing or not numeric)
      def parse_filesize
        @version["filesize"].to_i
      end

      # @return [RelatonBib::StructuredIdentifierCollection]
      def parse_structuredidentifier
        strid = RelatonBib::StructuredIdentifier.new(
          type: "Handbook", agency: ["PLATEAU"], docnumber: @entry["slug"], edition: edition
        )
        RelatonBib::StructuredIdentifierCollection.new [strid]
      end
    end
  end
end
@@ -0,0 +1,50 @@
1
module Relaton
  module Plateau
    #
    # Converter of a document hash into arguments of BibItem. Extends
    # RelatonBib::HashConverter with the flavor-specific `ext` fields.
    #
    module HashConverter
      include RelatonBib::HashConverter
      extend self

      #
      # Convert a hash to BibItem arguments. After the parent conversion the
      # cover, filesize and stagename values are moved from `ext` to the top
      # level and the `ext` key is removed.
      #
      # @param args [Hash]
      # @return [Hash, nil] nil when the parent converter returns nothing
      def hash_to_bib(args)
        ret = super
        return unless ret
        return ret unless ret[:ext]

        hash_to_bib_cover ret
        hash_to_bib_filesize ret
        hash_to_bib_stagename ret
        ret.delete :ext
        ret
      end

      #
      # Build the Cover from `ext.cover.image`. Nothing is set when the cover
      # or its image is missing.
      #
      # @param ret [Hash] converted hash, modified in place
      #
      def hash_to_bib_cover(ret)
        image = ret.dig(:ext, :cover, :image)
        return unless image

        ret[:cover] = Cover.new(RelatonBib::Image.new(**image))
      end

      #
      # Copy `ext.filesize` to the top level as an Integer.
      #
      # @param ret [Hash] converted hash, modified in place
      #
      def hash_to_bib_filesize(ret)
        return unless ret[:ext][:filesize]

        ret[:filesize] = ret[:ext][:filesize].to_i
      end

      #
      # Build the Stagename from `ext.stagename`.
      #
      # @param ret [Hash] converted hash, modified in place
      #
      def hash_to_bib_stagename(ret)
        return unless ret[:ext][:stagename]

        ret[:stagename] = Stagename.new(**ret[:ext][:stagename])
      end

      # @param item_hash [Hash]
      # @return [Relaton::Plateau::BibItem]
      def bib_item(item_hash)
        BibItem.new(**item_hash)
      end

      # @param args [Hash] keyword arguments of DocumentType
      # @return [Relaton::Plateau::DocumentType]
      def create_doctype(**args)
        DocumentType.new(**args)
      end
    end
  end
end