relaton-asme 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +25 -0
- data/README.adoc +539 -0
- data/lib/relaton_asme/fetcher/asme_format.rb +26 -0
- data/lib/relaton_asme/fetcher/asme_publication.rb +24 -0
- data/lib/relaton_asme/fetcher/asme_publication_edition.rb +58 -0
- data/lib/relaton_asme/fetcher/basic_page.rb +51 -0
- data/lib/relaton_asme/fetcher/index_page.rb +127 -0
- data/lib/relaton_asme/fetcher/publication_page.rb +130 -0
- data/lib/relaton_asme/fetcher/runner.rb +169 -0
- data/lib/relaton_asme/version.rb +5 -0
- data/lib/relaton_asme.rb +14 -0
- metadata +85 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: e288ff1fe94fd8f66ca1d903b2c2d83d97b2d3b35555547e0d8e0bfe1eb04f80
|
|
4
|
+
data.tar.gz: beb6d02b7c13f3ff37fecec8e22a4ffd9cffe4429c656e286e4c853f283d5539
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 4cad93db00b8e5ccd5f75e00e07852735f19e9c16e63d5e6485e671df791f2b726c856a41178e26d6e912cc7ebe684f5faa79457bead82283b29f90f81ce9748
|
|
7
|
+
data.tar.gz: 5a7b3379cfb3d7571b5d9c1a685a83b9c822ea7dea55dcb0f6ff40255551632014ce7798a73a288674a114432a9b440863c13d7f41969fc28be2ad6bb90ea8e9
|
data/LICENSE
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
BSD 2-Clause License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025, Ribose Inc.
|
|
4
|
+
All rights reserved.
|
|
5
|
+
|
|
6
|
+
Redistribution and use in source and binary forms, with or without
|
|
7
|
+
modification, are permitted provided that the following conditions are met:
|
|
8
|
+
|
|
9
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
|
10
|
+
list of conditions and the following disclaimer.
|
|
11
|
+
|
|
12
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
13
|
+
this list of conditions and the following disclaimer in the documentation
|
|
14
|
+
and/or other materials provided with the distribution.
|
|
15
|
+
|
|
16
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
17
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
18
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
19
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
20
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
21
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
22
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
23
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
24
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
25
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/README.adoc
ADDED
|
@@ -0,0 +1,539 @@
|
|
|
1
|
+
= Relaton for ASME
|
|
2
|
+
|
|
3
|
+
image:https://img.shields.io/gem/v/relaton-asme.svg[Gem Version,link=https://rubygems.org/gems/relaton-asme]
|
|
4
|
+
image:https://img.shields.io/github/license/relaton/relaton-asme.svg[License]
|
|
5
|
+
image:https://github.com/relaton/relaton-asme/actions/workflows/rake.yml/badge.svg[Build Status,link=https://github.com/relaton/relaton-asme/actions/workflows/rake.yml]
|
|
6
|
+
|
|
7
|
+
Fetch and parse ASME (American Society of Mechanical Engineers) standards and
|
|
8
|
+
codes from the official ASME website.
|
|
9
|
+
|
|
10
|
+
== Purpose
|
|
11
|
+
|
|
12
|
+
Relaton for ASME provides a programmatic interface to fetch ASME standards and codes publications.
|
|
13
|
+
|
|
14
|
+
It accesses the ASME JSON API for publication listings and parses JSON-LD
|
|
15
|
+
structured data from publication pages to extract complete bibliographic
|
|
16
|
+
metadata.
|
|
17
|
+
|
|
18
|
+
== Features
|
|
19
|
+
|
|
20
|
+
* <<fetching-publication-listings,Fetching publication listings>> - Access ASME JSON API with pagination
|
|
21
|
+
* <<parsing-publication-metadata,Parsing publication metadata>> - Extract data from JSON-LD structured content
|
|
22
|
+
* <<data-serialization,Data serialization>> - Export to JSON/YAML formats
|
|
23
|
+
* <<batch-processing,Batch processing>> - Fetch multiple publications efficiently
|
|
24
|
+
* <<pre-scraped-database,Pre-scraped database>> - Access 4,928 ASME editions (1967-2025)
|
|
25
|
+
|
|
26
|
+
== Architecture
|
|
27
|
+
|
|
28
|
+
=== General
|
|
29
|
+
|
|
30
|
+
RelatonAsme uses a three-stage data pipeline with an object-oriented, model-driven architecture:
|
|
31
|
+
|
|
32
|
+
.Data Pipeline
|
|
33
|
+
[source]
|
|
34
|
+
----
|
|
35
|
+
Stage 1: ASME JSON API
|
|
36
|
+
│
|
|
37
|
+
└─► IndexPage fetches from /api/products
|
|
38
|
+
│
|
|
39
|
+
└─► Returns publication URLs
|
|
40
|
+
│
|
|
41
|
+
▼
|
|
42
|
+
Stage 2: JSON-LD Parsing
|
|
43
|
+
│
|
|
44
|
+
└─► PublicationPage parses <script type="application/ld+json">
|
|
45
|
+
│
|
|
46
|
+
└─► Extracts Schema.org Product data
|
|
47
|
+
│
|
|
48
|
+
▼
|
|
49
|
+
Stage 3: Data Models
|
|
50
|
+
│
|
|
51
|
+
└─► AsmePublicationEdition (with URL, metadata, pricing)
|
|
52
|
+
----
|
|
53
|
+
|
|
54
|
+
=== Data model hierarchy
|
|
55
|
+
|
|
56
|
+
[source]
|
|
57
|
+
----
|
|
58
|
+
AsmePublication (Container)
|
|
59
|
+
│
|
|
60
|
+
└── AsmePublicationEdition[] (Primary Model - Array)
|
|
61
|
+
│
|
|
62
|
+
├── url: String (publication page URL)
|
|
63
|
+
├── document_type: String ("standard")
|
|
64
|
+
├── title: String (WITHOUT designator prefix)
|
|
65
|
+
├── document_identifier: String (e.g., "BPVC.I-2025")
|
|
66
|
+
├── designator: String (e.g., "BPVC.I")
|
|
67
|
+
├── edition_year: String (e.g., "2025")
|
|
68
|
+
├── publisher: String ("ASME")
|
|
69
|
+
├── publish_date: String
|
|
70
|
+
├── language: String ("EN")
|
|
71
|
+
├── number_of_pages: Integer (optional)
|
|
72
|
+
├── description: String (optional)
|
|
73
|
+
├── topics: String[] (optional)
|
|
74
|
+
└── formats: AsmeFormat[] (optional)
|
|
75
|
+
│
|
|
76
|
+
├── format_type: String (e.g., "PDF", "Print Book")
|
|
77
|
+
├── price: String (e.g., "$675.00")
|
|
78
|
+
└── isbn: String (optional)
|
|
79
|
+
----
|
|
80
|
+
|
|
81
|
+
== Installation
|
|
82
|
+
|
|
83
|
+
Add this line to your application's Gemfile:
|
|
84
|
+
|
|
85
|
+
[source,ruby]
|
|
86
|
+
----
|
|
87
|
+
gem "relaton-asme"
|
|
88
|
+
----
|
|
89
|
+
|
|
90
|
+
And then execute:
|
|
91
|
+
|
|
92
|
+
[source,shell]
|
|
93
|
+
----
|
|
94
|
+
bundle install
|
|
95
|
+
----
|
|
96
|
+
|
|
97
|
+
Or install it yourself as:
|
|
98
|
+
|
|
99
|
+
[source,shell]
|
|
100
|
+
----
|
|
101
|
+
gem install relaton-asme
|
|
102
|
+
----
|
|
103
|
+
|
|
104
|
+
== Usage
|
|
105
|
+
|
|
106
|
+
[[fetching-publication-listings]]
|
|
107
|
+
=== Fetching publication listings
|
|
108
|
+
|
|
109
|
+
==== General
|
|
110
|
+
|
|
111
|
+
The `IndexPage` class fetches publication listings from the ASME JSON API at
|
|
112
|
+
`https://www.asme.org/api/products`.
|
|
113
|
+
|
|
114
|
+
==== Fetching publication URLs from API
|
|
115
|
+
|
|
116
|
+
Syntax:
|
|
117
|
+
|
|
118
|
+
[source,ruby]
|
|
119
|
+
----
|
|
120
|
+
index = RelatonAsme::Fetcher::IndexPage.new(
|
|
121
|
+
page_number: {page_number}, <1>
|
|
122
|
+
per_page: {per_page} <2>
|
|
123
|
+
)
|
|
124
|
+
urls = index.publication_urls <3>
|
|
125
|
+
----
|
|
126
|
+
<1> Page number to fetch (1-based, default: 1)
|
|
127
|
+
<2> Number of results per page (default: 100, max: 100)
|
|
128
|
+
<3> Returns array of publication URLs
|
|
129
|
+
|
|
130
|
+
Where,
|
|
131
|
+
|
|
132
|
+
`page_number`:: The page number to fetch, starting from 1. Default is 1.
|
|
133
|
+
`per_page`:: Number of publications to fetch per page. Default is 100.
|
|
134
|
+
`publication_urls`:: Returns an array of publication URLs for each edition/format combination.
|
|
135
|
+
|
|
136
|
+
.Fetching URLs from the first page of API
|
|
137
|
+
[example]
|
|
138
|
+
====
|
|
139
|
+
[source,ruby]
|
|
140
|
+
----
|
|
141
|
+
require "relaton_asme"
|
|
142
|
+
|
|
143
|
+
index = RelatonAsme::Fetcher::IndexPage.new(page_number: 1, per_page: 10)
|
|
144
|
+
urls = index.publication_urls
|
|
145
|
+
|
|
146
|
+
puts "Found #{urls.size} publication URLs"
|
|
147
|
+
urls.first(5).each { |url| puts url }
|
|
148
|
+
----
|
|
149
|
+
|
|
150
|
+
This queries the ASME API and returns URLs for each edition/format combination.
|
|
151
|
+
|
|
152
|
+
Example output:
|
|
153
|
+
----
|
|
154
|
+
Found 30 publication URLs
|
|
155
|
+
https://www.asme.org/codes-standards/find-codes-standards/bpvc-i-.../2025/print-book
|
|
156
|
+
https://www.asme.org/codes-standards/find-codes-standards/bpvc-i-.../2023/print-book
|
|
157
|
+
----
|
|
158
|
+
====
|
|
159
|
+
|
|
160
|
+
[[parsing-publication-metadata]]
|
|
161
|
+
=== Parsing publication metadata
|
|
162
|
+
|
|
163
|
+
==== General
|
|
164
|
+
|
|
165
|
+
The `PublicationPage` class parses JSON-LD structured data from individual publication pages. Each page may contain multiple editions (different years) as separate offers.
|
|
166
|
+
|
|
167
|
+
==== Fetching publication data from JSON-LD
|
|
168
|
+
|
|
169
|
+
Syntax:
|
|
170
|
+
|
|
171
|
+
[source,ruby]
|
|
172
|
+
----
|
|
173
|
+
page = RelatonAsme::Fetcher::PublicationPage.new({url}) <1>
|
|
174
|
+
publication = page.to_data <2>
|
|
175
|
+
----
|
|
176
|
+
<1> Publication URL to fetch
|
|
177
|
+
<2> Returns `AsmePublication` object with editions extracted from JSON-LD offers
|
|
178
|
+
|
|
179
|
+
Where,
|
|
180
|
+
|
|
181
|
+
`url`:: The full URL to the publication page on asme.org
|
|
182
|
+
`to_data`:: Returns an `AsmePublication` object containing all editions from the page's JSON-LD data
|
|
183
|
+
|
|
184
|
+
.Fetching a publication via JSON-LD
|
|
185
|
+
[example]
|
|
186
|
+
====
|
|
187
|
+
[source,ruby]
|
|
188
|
+
----
|
|
189
|
+
url = "https://www.asme.org/codes-standards/find-codes-standards/" \
|
|
190
|
+
"temperature-measurement/2024/pdf"
|
|
191
|
+
|
|
192
|
+
page = RelatonAsme::Fetcher::PublicationPage.new(url)
|
|
193
|
+
publication = page.to_data
|
|
194
|
+
|
|
195
|
+
if publication
|
|
196
|
+
publication.editions.each do |edition|
|
|
197
|
+
puts "Title: #{edition.title}"
|
|
198
|
+
puts "Year: #{edition.edition_year}"
|
|
199
|
+
puts "URL: #{edition.url}"
|
|
200
|
+
puts "Identifier: #{edition.document_identifier}"
|
|
201
|
+
end
|
|
202
|
+
end
|
|
203
|
+
----
|
|
204
|
+
|
|
205
|
+
This parses the JSON-LD structured data and may return multiple editions (e.g.,
|
|
206
|
+
2024 and 1974 versions) from a single page.
|
|
207
|
+
|
|
208
|
+
Example output:
|
|
209
|
+
----
|
|
210
|
+
Title: Temperature Measurement (2024)
|
|
211
|
+
Year: 2024
|
|
212
|
+
URL: https://www.asme.org/.../2024/pdf/
|
|
213
|
+
Identifier: PTC 19.3-2024
|
|
214
|
+
|
|
215
|
+
Title: Temperature Measurement (1974)
|
|
216
|
+
Year: 1974
|
|
217
|
+
URL: https://www.asme.org/.../1974/pdf/
|
|
218
|
+
Identifier: PTC 19.3-1974
|
|
219
|
+
----
|
|
220
|
+
====
|
|
221
|
+
|
|
222
|
+
[[batch-processing]]
|
|
223
|
+
=== Batch processing with Runner
|
|
224
|
+
|
|
225
|
+
==== General
|
|
226
|
+
|
|
227
|
+
The `Runner` class orchestrates batch fetching of publications from the API.
|
|
228
|
+
|
|
229
|
+
==== Fetching all publications
|
|
230
|
+
|
|
231
|
+
Syntax:
|
|
232
|
+
|
|
233
|
+
[source,ruby]
|
|
234
|
+
----
|
|
235
|
+
runner = RelatonAsme::Fetcher::Runner.new <1>
|
|
236
|
+
editions = runner.fetch_all(
|
|
237
|
+
max_pages: {max_pages}, <2>
|
|
238
|
+
per_page: {per_page} <3>
|
|
239
|
+
)
|
|
240
|
+
----
|
|
241
|
+
<1> Create a new runner instance
|
|
242
|
+
<2> Maximum number of API pages to fetch (nil for all pages)
|
|
243
|
+
<3> Number of publications per page (default: 100)
|
|
244
|
+
|
|
245
|
+
Where,
|
|
246
|
+
|
|
247
|
+
`max_pages`:: Optional limit on the number of API pages to process. Pass `nil` to fetch all 671 publications (135 pages).
|
|
248
|
+
`per_page`:: Number of publications to fetch per API page. Default is 100, maximum is 100.
|
|
249
|
+
`editions`:: Returns an array of `AsmePublicationEdition` objects from all fetched publications.
|
|
250
|
+
|
|
251
|
+
.Fetching publications from first 2 pages
|
|
252
|
+
[example]
|
|
253
|
+
====
|
|
254
|
+
[source,ruby]
|
|
255
|
+
----
|
|
256
|
+
runner = RelatonAsme::Fetcher::Runner.new
|
|
257
|
+
editions = runner.fetch_all(max_pages: 2, per_page: 10)
|
|
258
|
+
|
|
259
|
+
puts "Fetched #{editions.size} editions"
|
|
260
|
+
editions.first(3).each do |edition|
|
|
261
|
+
puts "#{edition.document_identifier} - #{edition.title}"
|
|
262
|
+
end
|
|
263
|
+
----
|
|
264
|
+
|
|
265
|
+
This fetches publications from the first 2 API pages and parses their JSON-LD data.
|
|
266
|
+
====
|
|
267
|
+
|
|
268
|
+
==== Fetching and saving to file
|
|
269
|
+
|
|
270
|
+
Syntax:
|
|
271
|
+
|
|
272
|
+
[source,ruby]
|
|
273
|
+
----
|
|
274
|
+
runner = RelatonAsme::Fetcher::Runner.new
|
|
275
|
+
runner.fetch_all_editions(
|
|
276
|
+
{output_file}, <1>
|
|
277
|
+
format: {format}, <2>
|
|
278
|
+
max_pages: {max_pages} <3>
|
|
279
|
+
)
|
|
280
|
+
----
|
|
281
|
+
<1> Path to output file
|
|
282
|
+
<2> Output format (`:json` or `:yaml`)
|
|
283
|
+
<3> Optional maximum number of pages
|
|
284
|
+
|
|
285
|
+
.Saving publications to YAML
|
|
286
|
+
[example]
|
|
287
|
+
====
|
|
288
|
+
[source,ruby]
|
|
289
|
+
----
|
|
290
|
+
runner = RelatonAsme::Fetcher::Runner.new
|
|
291
|
+
runner.fetch_all_editions(
|
|
292
|
+
"asme_publications.yml",
|
|
293
|
+
format: :yaml,
|
|
294
|
+
max_pages: 1
|
|
295
|
+
)
|
|
296
|
+
----
|
|
297
|
+
|
|
298
|
+
This fetches publications from the first API page, parses their JSON-LD data,
|
|
299
|
+
and saves them as YAML to `asme_publications.yml`.
|
|
300
|
+
====
|
|
301
|
+
|
|
302
|
+
[[data-serialization]]
|
|
303
|
+
=== Data serialization
|
|
304
|
+
|
|
305
|
+
==== General
|
|
306
|
+
|
|
307
|
+
All data models support JSON and YAML serialization through `lutaml-model`.
|
|
308
|
+
|
|
309
|
+
==== Serializing to JSON
|
|
310
|
+
|
|
311
|
+
.Converting an edition to JSON
|
|
312
|
+
[example]
|
|
313
|
+
====
|
|
314
|
+
[source,ruby]
|
|
315
|
+
----
|
|
316
|
+
edition = RelatonAsme::Fetcher::AsmePublicationEdition.new(
|
|
317
|
+
url: "https://www.asme.org/...",
|
|
318
|
+
document_type: "standard",
|
|
319
|
+
title: "BPVC Section I",
|
|
320
|
+
document_identifier: "BPVC.I-2025",
|
|
321
|
+
designator: "BPVC.I",
|
|
322
|
+
edition_year: "2025",
|
|
323
|
+
publisher: "ASME",
|
|
324
|
+
publish_date: "2025",
|
|
325
|
+
language: "EN"
|
|
326
|
+
)
|
|
327
|
+
|
|
328
|
+
json = edition.to_json
|
|
329
|
+
puts json
|
|
330
|
+
----
|
|
331
|
+
|
|
332
|
+
Note: `edition_year` is a String, not an Integer.
|
|
333
|
+
====
|
|
334
|
+
|
|
335
|
+
[[pre-scraped-database]]
|
|
336
|
+
=== Pre-scraped database
|
|
337
|
+
|
|
338
|
+
==== General
|
|
339
|
+
|
|
340
|
+
A complete database of ASME publications has been pre-scraped and is available in `data/asme_publication_editions.yml`.
|
|
341
|
+
|
|
342
|
+
==== Database contents
|
|
343
|
+
|
|
344
|
+
The database contains:
|
|
345
|
+
|
|
346
|
+
* **4,928 ASME publication editions**
|
|
347
|
+
* **269 unique ASME publications**
|
|
348
|
+
* **Year range:** 1967 - 2025 (58 years)
|
|
349
|
+
* **Formats:** Print Book, PDF, Bundle
|
|
350
|
+
|
|
351
|
+
==== Loading the database
|
|
352
|
+
|
|
353
|
+
.Loading pre-scraped ASME data
|
|
354
|
+
[example]
|
|
355
|
+
====
|
|
356
|
+
[source,ruby]
|
|
357
|
+
----
|
|
358
|
+
require "yaml"
|
|
359
|
+
|
|
360
|
+
data = YAML.load_file(
|
|
361
|
+
"data/asme_publication_editions.yml",
|
|
362
|
+
permitted_classes: [Symbol]
|
|
363
|
+
)
|
|
364
|
+
|
|
365
|
+
puts "Loaded #{data.size} ASME editions"
|
|
366
|
+
|
|
367
|
+
# Find all editions for a specific designator
|
|
368
|
+
bpvc_editions = data.select { |e| e["designator"] == "BPVC.I" }
|
|
369
|
+
puts "BPVC.I has #{bpvc_editions.size} editions"
|
|
370
|
+
|
|
371
|
+
# Find editions by year
|
|
372
|
+
recent = data.select { |e| e["edition_year"].to_i >= 2020 }
|
|
373
|
+
puts "#{recent.size} editions from 2020 onwards"
|
|
374
|
+
----
|
|
375
|
+
====
|
|
376
|
+
|
|
377
|
+
== Data model details
|
|
378
|
+
|
|
379
|
+
=== AsmePublicationEdition (Primary Model)
|
|
380
|
+
|
|
381
|
+
The primary data model containing all publication metadata for a specific edition.
|
|
382
|
+
|
|
383
|
+
.Attributes
|
|
384
|
+
[cols="1,1,3"]
|
|
385
|
+
|===
|
|
386
|
+
|Attribute |Type |Description
|
|
387
|
+
|
|
388
|
+
|`url`
|
|
389
|
+
|String
|
|
390
|
+
|Direct link to publication page on asme.org
|
|
391
|
+
|
|
392
|
+
|`document_type`
|
|
393
|
+
|String
|
|
394
|
+
|Type of document (typically "standard")
|
|
395
|
+
|
|
396
|
+
|`title`
|
|
397
|
+
|String
|
|
398
|
+
|Edition title WITHOUT designator prefix, with year in parentheses
|
|
399
|
+
|
|
400
|
+
|`document_identifier`
|
|
401
|
+
|String
|
|
402
|
+
|Edition-specific identifier (e.g., "BPVC.I-2025")
|
|
403
|
+
|
|
404
|
+
|`designator`
|
|
405
|
+
|String
|
|
406
|
+
|ASME code/SKU (e.g., "BPVC.I", "B31.3", "Y14.5")
|
|
407
|
+
|
|
408
|
+
|`edition_year`
|
|
409
|
+
|String
|
|
410
|
+
|Publication year as string (e.g., "2025", "2023")
|
|
411
|
+
|
|
412
|
+
|`publisher`
|
|
413
|
+
|String
|
|
414
|
+
|Publisher name (always "ASME")
|
|
415
|
+
|
|
416
|
+
|`publish_date`
|
|
417
|
+
|String
|
|
418
|
+
|Publication date (typically same as edition_year)
|
|
419
|
+
|
|
420
|
+
|`language`
|
|
421
|
+
|String
|
|
422
|
+
|Language code (typically "EN")
|
|
423
|
+
|
|
424
|
+
|`number_of_pages`
|
|
425
|
+
|Integer
|
|
426
|
+
|Page count (optional, not available in JSON-LD)
|
|
427
|
+
|
|
428
|
+
|`description`
|
|
429
|
+
|String
|
|
430
|
+
|Detailed description (optional, not available in JSON-LD)
|
|
431
|
+
|
|
432
|
+
|`topics`
|
|
433
|
+
|Array<String>
|
|
434
|
+
|Array of topic names (optional, not available in JSON-LD)
|
|
435
|
+
|
|
436
|
+
|`formats`
|
|
437
|
+
|Array<AsmeFormat>
|
|
438
|
+
|Available formats and pricing from JSON-LD offers
|
|
439
|
+
|===
|
|
440
|
+
|
|
441
|
+
=== AsmeFormat
|
|
442
|
+
|
|
443
|
+
Represents a publication format with pricing information extracted from JSON-LD offers.
|
|
444
|
+
|
|
445
|
+
.Attributes
|
|
446
|
+
[cols="1,1,3"]
|
|
447
|
+
|===
|
|
448
|
+
|Attribute |Type |Description
|
|
449
|
+
|
|
450
|
+
|`format_type`
|
|
451
|
+
|String
|
|
452
|
+
|Format type (e.g., "PDF", "Print Book", "Bundle")
|
|
453
|
+
|
|
454
|
+
|`price`
|
|
455
|
+
|String
|
|
456
|
+
|Price formatted as string (e.g., "$675.00")
|
|
457
|
+
|
|
458
|
+
|`isbn`
|
|
459
|
+
|String
|
|
460
|
+
|ISBN number if available (optional, not in JSON-LD)
|
|
461
|
+
|===
|
|
462
|
+
|
|
463
|
+
=== AsmePublication (Container)
|
|
464
|
+
|
|
465
|
+
Container model for grouping related publication editions.
|
|
466
|
+
|
|
467
|
+
.Attributes
|
|
468
|
+
[cols="1,1,3"]
|
|
469
|
+
|===
|
|
470
|
+
|Attribute |Type |Description
|
|
471
|
+
|
|
472
|
+
|`publication_group`
|
|
473
|
+
|String
|
|
474
|
+
|Publication designator/SKU used for grouping
|
|
475
|
+
|
|
476
|
+
|`editions`
|
|
477
|
+
|Array<AsmePublicationEdition>
|
|
478
|
+
|Array of publication editions with different years/formats
|
|
479
|
+
|===
|
|
480
|
+
|
|
481
|
+
== Development
|
|
482
|
+
|
|
483
|
+
After checking out the repo, run `bundle install` to install dependencies.
|
|
484
|
+
|
|
485
|
+
Run tests:
|
|
486
|
+
|
|
487
|
+
[source,shell]
|
|
488
|
+
----
|
|
489
|
+
bundle exec rake spec
|
|
490
|
+
----
|
|
491
|
+
|
|
492
|
+
Run RuboCop:
|
|
493
|
+
|
|
494
|
+
[source,shell]
|
|
495
|
+
----
|
|
496
|
+
bundle exec rake rubocop
|
|
497
|
+
----
|
|
498
|
+
|
|
499
|
+
Run all checks (tests + RuboCop):
|
|
500
|
+
|
|
501
|
+
[source,shell]
|
|
502
|
+
----
|
|
503
|
+
bundle exec rake
|
|
504
|
+
----
|
|
505
|
+
|
|
506
|
+
== Scraping ASME Publications
|
|
507
|
+
|
|
508
|
+
To scrape ASME publications and save to a file:
|
|
509
|
+
|
|
510
|
+
[source,ruby]
|
|
511
|
+
----
|
|
512
|
+
require "relaton_asme"
|
|
513
|
+
|
|
514
|
+
runner = RelatonAsme::Fetcher::Runner.new
|
|
515
|
+
|
|
516
|
+
# Fetch all publications (671 total, takes ~2 hours)
|
|
517
|
+
runner.fetch_all_editions(
|
|
518
|
+
"data/asme_publications.yml",
|
|
519
|
+
format: :yaml
|
|
520
|
+
)
|
|
521
|
+
|
|
522
|
+
# Or fetch just a few pages for testing
|
|
523
|
+
runner.fetch_all_editions(
|
|
524
|
+
"data/sample.yml",
|
|
525
|
+
format: :yaml,
|
|
526
|
+
max_pages: 2
|
|
527
|
+
)
|
|
528
|
+
----
|
|
529
|
+
|
|
530
|
+
== Contributing
|
|
531
|
+
|
|
532
|
+
Bug reports and pull requests are welcome on GitHub at
|
|
533
|
+
https://github.com/relaton/relaton-asme.
|
|
534
|
+
|
|
535
|
+
== Copyright & license
|
|
536
|
+
|
|
537
|
+
Copyright Ribose.
|
|
538
|
+
|
|
539
|
+
The gem is available as open source under the terms of the BSD-2-Clause License.
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "lutaml/model"
|
|
4
|
+
|
|
5
|
+
module RelatonAsme
|
|
6
|
+
module Fetcher
|
|
7
|
+
# Serializable model for one format of a publication
# (e.g., PDF, Print Book, Bundle).
class AsmeFormat < Lutaml::Model::Serializable
  attribute :format_type, :string
  attribute :price, :string
  attribute :isbn, :string

  # JSON and YAML use identical key names, so the mapping is written once
  # and handed to both serializer declarations.
  key_mapping = proc do
    map "format_type", to: :format_type
    map "price", to: :price
    map "isbn", to: :isbn
  end

  json(&key_mapping)
  yaml(&key_mapping)
end
|
|
25
|
+
end
|
|
26
|
+
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "lutaml/model"
|
|
4
|
+
require_relative "asme_publication_edition"
|
|
5
|
+
|
|
6
|
+
module RelatonAsme
|
|
7
|
+
module Fetcher
|
|
8
|
+
# Container model that groups the editions belonging to one publication.
class AsmePublication < Lutaml::Model::Serializable
  attribute :publication_group, :string
  attribute :editions, AsmePublicationEdition, collection: true

  # JSON and YAML use identical key names, so the mapping is written once
  # and handed to both serializer declarations.
  key_mapping = proc do
    map "publication_group", to: :publication_group
    map "editions", to: :editions
  end

  json(&key_mapping)
  yaml(&key_mapping)
end
|
|
23
|
+
end
|
|
24
|
+
end
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "lutaml/model"
|
|
4
|
+
require_relative "asme_format"
|
|
5
|
+
|
|
6
|
+
module RelatonAsme
|
|
7
|
+
module Fetcher
|
|
8
|
+
# One specific edition of an ASME publication.
# This is the primary data model; it carries all edition metadata.
class AsmePublicationEdition < Lutaml::Model::Serializable
  attribute :url, :string
  attribute :document_type, :string
  attribute :title, :string
  attribute :document_identifier, :string
  attribute :designator, :string
  attribute :edition_year, :string
  attribute :publisher, :string
  attribute :publish_date, :string
  attribute :language, :string
  attribute :number_of_pages, :integer
  attribute :description, :string
  attribute :topics, :string, collection: true
  attribute :formats, AsmeFormat, collection: true

  # JSON and YAML use identical key names, so the mapping is written once
  # and handed to both serializer declarations.
  key_mapping = proc do
    map "url", to: :url
    map "document_type", to: :document_type
    map "title", to: :title
    map "document_identifier", to: :document_identifier
    map "designator", to: :designator
    map "edition_year", to: :edition_year
    map "publisher", to: :publisher
    map "publish_date", to: :publish_date
    map "language", to: :language
    map "number_of_pages", to: :number_of_pages
    map "description", to: :description
    map "topics", to: :topics
    map "formats", to: :formats
  end

  json(&key_mapping)
  yaml(&key_mapping)
end
|
|
57
|
+
end
|
|
58
|
+
end
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "mechanize"
|
|
4
|
+
|
|
5
|
+
module RelatonAsme
|
|
6
|
+
module Fetcher
|
|
7
|
+
# Base class for page fetchers built on a Mechanize agent.
#
# Holds the target URL, the agent used to retrieve it, and the most
# recently fetched page.
class BasicPage
  attr_reader :url, :page, :agent

  # Initialize a new page fetcher
  #
  # @param url [String] URL to fetch
  # @param agent [Mechanize, nil] Optional agent to reuse; a new one is
  #   created when none is given
  def initialize(url, agent: nil)
    @url = url
    @agent = agent || create_agent
    @page = nil
  end

  # Fetch the page unconditionally and remember it in +page+.
  #
  # @return [Object] whatever the agent's +get+ returns for +url+
  def fetch
    @page = @agent.get(url)
  end

  # Get the page, fetching only if nothing has been fetched yet.
  #
  # Memoizes on @page (the same variable #fetch writes) so that an explicit
  # #fetch followed by #ensure_page does not issue a second request.
  #
  # @return [Object] the already-fetched or newly fetched page
  def ensure_page
    @page ||= fetch
  end

  private

  # Create a new Mechanize agent with standard configuration
  #
  # @return [Mechanize] Configured agent
  def create_agent
    agent = Mechanize.new
    agent.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2_1) " \
                       "AppleWebKit/605.1.15 (KHTML, like Gecko) " \
                       "Version/17.2 Safari/605.1.15"
    agent.follow_meta_refresh = true
    agent.redirect_ok = true
    agent
  end
end
|
|
50
|
+
end
|
|
51
|
+
end
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require_relative "basic_page"
|
|
5
|
+
|
|
6
|
+
module RelatonAsme
|
|
7
|
+
module Fetcher
|
|
8
|
+
# Fetches one page of the ASME standards index from the JSON API and turns
# the response into publication page URLs.
class IndexPage < BasicPage
  API_URL = "https://www.asme.org/api/products"
  DEFAULT_PER_PAGE = 100

  # Initialize index page for a specific page number
  #
  # @param page_number [Integer] Page number (1-based)
  # @param per_page [Integer] Results per page
  # @param agent [Mechanize, nil] Optional Mechanize agent
  def initialize(page_number: 1, per_page: DEFAULT_PER_PAGE, agent: nil)
    @page_number = page_number
    @per_page = per_page
    super(build_api_url(page_number, per_page), agent: agent)
  end

  # Extract publication URLs from the API response
  #
  # @return [Array<String>] Array of publication URLs
  def publication_urls
    ensure_page
    extract_urls_from_json
  end

  # Check if there are more pages
  #
  # A response that cannot be parsed, is not a JSON object, or lacks the
  # "page"/"pages" keys is treated as "no more pages" instead of raising.
  #
  # @return [Boolean] True if more pages exist
  def more_pages?
    ensure_page
    data = parse_json_response
    return false unless data.is_a?(Hash)

    # to_i makes missing keys (nil -> 0) and numeric strings compare
    # numerically rather than raising or comparing lexicographically.
    data["page"].to_i < data["pages"].to_i
  end

  private

  # Build the API URL with query parameters
  #
  # @param page_number [Integer] Page number
  # @param per_page [Integer] Results per page
  # @return [String] Full API URL
  def build_api_url(page_number, per_page)
    query = {
      "type" => "Codes-Standards",
      "page" => page_number,
      "perPage" => per_page,
      "sortBy" => "date",
      "sortByDir" => "desc",
    }.map { |key, value| "#{key}=#{value}" }.join("&")

    "#{API_URL}?#{query}"
  end

  # Parse JSON response from API
  #
  # @return [Object, nil] Parsed JSON data, or nil when the body is not
  #   valid JSON (a warning is emitted in that case)
  def parse_json_response
    JSON.parse(page.body)
  rescue JSON::ParserError => e
    warn "Failed to parse JSON response: #{e.message}"
    nil
  end

  # Extract publication URLs from JSON response
  #
  # @return [Array<String>] Unique publication URLs; empty when the
  #   response is not a JSON object or has no "results"
  def extract_urls_from_json
    data = parse_json_response
    results = data["results"] if data.is_a?(Hash)
    return [] unless results

    results.flat_map { |result| build_publication_urls(result) }.compact.uniq
  end

  # Build publication URLs from result data
  #
  # @param result [Hash] Result data from API
  # @return [Array<String>] URLs for each edition/format combination
  def build_publication_urls(result)
    base_url = result.dig("WebsiteProductGrouping", "URL")
    return [] unless base_url

    editions = result.dig("WebsiteProductGrouping", "Editions") || []
    formats = result.dig("WebsiteProductGrouping", "Formats") || []

    # One URL per edition/format pair, editions outermost.
    editions.flat_map do |edition|
      formats.map do |format|
        "https://www.asme.org#{base_url}/#{edition}/#{format_to_path(format)}"
      end
    end
  end

  # Convert format name to URL path component
  #
  # @param format [String] Format name
  # @return [String] URL path component
  def format_to_path(format)
    normalized = format.downcase
    case normalized
    when /print.*book/ then "print-book"
    when /pdf/ then "pdf"
    when /bundle/ then "bundle"
    else normalized.tr(" ", "-")
    end
  end
end
|
|
126
|
+
end
|
|
127
|
+
end
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require_relative "basic_page"
|
|
5
|
+
require_relative "asme_publication"
|
|
6
|
+
require_relative "asme_publication_edition"
|
|
7
|
+
require_relative "asme_format"
|
|
8
|
+
|
|
9
|
+
module RelatonAsme
|
|
10
|
+
module Fetcher
|
|
11
|
+
# Handles scraping of individual ASME publication pages
|
|
12
|
+
class PublicationPage < BasicPage
|
|
13
|
+
# Convert page data to AsmePublication model
|
|
14
|
+
#
|
|
15
|
+
# @return [AsmePublication, nil] Publication with editions or nil
|
|
16
|
+
def to_data
  # Make sure the page has been fetched before reading its JSON-LD.
  ensure_page

  parsed = parse_jsonld
  # No usable JSON-LD means there is nothing to build a publication from.
  parsed ? build_publication_from_jsonld(parsed) : nil
end
|
|
24
|
+
|
|
25
|
+
private
|
|
26
|
+
|
|
27
|
+
# Parse JSON-LD structured data from page
|
|
28
|
+
#
|
|
29
|
+
# @return [Hash, nil] Parsed JSON-LD data or nil
|
|
30
|
+
def parse_jsonld
  # Only the first JSON-LD script element on the page is considered.
  jsonld_script = page.search('script[type="application/ld+json"]').first
  return nil unless jsonld_script

  parsed = JSON.parse(jsonld_script.text)

  # JSON.parse can legitimately return an Array (a list of JSON-LD nodes) or
  # a bare scalar. Callers index the result with String keys, so only ever
  # hand back a Hash: prefer the node that carries "offers", otherwise the
  # first object node, otherwise nil.
  nodes = parsed.is_a?(Array) ? parsed : [parsed]
  objects = nodes.grep(Hash)
  objects.find { |node| node.key?("offers") } || objects.first
rescue JSON::ParserError => e
  # Malformed JSON-LD is reported and treated as "no structured data".
  warn "Failed to parse JSON-LD: #{e.message}"
  nil
end
|
|
39
|
+
|
|
40
|
+
# Build AsmePublication from JSON-LD data
|
|
41
|
+
#
|
|
42
|
+
# @param jsonld_data [Hash] JSON-LD data
|
|
43
|
+
# @return [AsmePublication] Publication with editions
|
|
44
|
+
def build_publication_from_jsonld(jsonld_data)
  base_title = jsonld_data["name"]
  designator = jsonld_data["sku"]

  # "offers" may be absent, a list of offers, or a single offer object.
  # Wrap a lone Hash so it is not iterated as key/value pairs (which made
  # the per-offer lookup raise TypeError).
  offers = jsonld_data["offers"] || []
  offers = [offers] if offers.is_a?(Hash)

  # filter_map drops offers for which no edition could be built (nil).
  editions = offers.filter_map do |offer|
    build_edition_from_offer(offer, base_title, designator)
  end

  AsmePublication.new(
    publication_group: designator,
    editions: editions
  )
end
|
|
58
|
+
|
|
59
|
+
# Build edition from a JSON-LD offer
|
|
60
|
+
#
|
|
61
|
+
# @param offer [Hash] Offer data
|
|
62
|
+
# @param base_title [String] Base publication title
|
|
63
|
+
# @param designator [String] Publication designator/SKU
|
|
64
|
+
# @return [AsmePublicationEdition, nil] Edition or nil
|
|
65
|
+
def build_edition_from_offer(offer, base_title, designator)
  link = offer["url"]

  # An edition needs both a URL and a year that can be read out of it.
  edition_year = link && extract_year_from_url(link)
  return nil unless edition_year

  # Each offer carries exactly one format, derived from the same URL.
  offer_format = build_format_from_offer(offer, extract_format_from_url(link))

  AsmePublicationEdition.new(
    url: link,
    document_type: "standard",
    title: "#{base_title} (#{edition_year})",
    document_identifier: "#{designator}-#{edition_year}",
    designator: designator,
    edition_year: edition_year.to_s,
    publisher: "ASME",
    publish_date: edition_year.to_s,
    language: "EN",
    formats: [offer_format]
  )
end
|
|
92
|
+
|
|
93
|
+
# Extract year from URL
|
|
94
|
+
#
|
|
95
|
+
# @param url [String] URL
|
|
96
|
+
# @return [Integer, nil] Year or nil
|
|
97
|
+
def extract_year_from_url(url)
  # First path segment made of exactly four digits with a slash on both
  # sides, e.g. ".../2020/pdf"; nil when the URL has no such segment.
  digits = url[%r{/(\d{4})/}, 1]
  digits&.to_i
end
|
|
101
|
+
|
|
102
|
+
# Extract format type from URL
|
|
103
|
+
#
|
|
104
|
+
# @param url [String] URL
|
|
105
|
+
# @return [String] Format type
|
|
106
|
+
def extract_format_from_url(url)
  # URL fragments are probed in this fixed order; the first one found
  # anywhere in the URL decides the label.
  markers = [
    ["/pdf", "PDF"],
    ["/print", "Print Book"],
    ["/bundle", "Bundle"]
  ]
  _, label = markers.find { |fragment, _| url.include?(fragment) }

  # URLs without any known fragment are labelled as PDF.
  label || "PDF"
end
|
|
113
|
+
|
|
114
|
+
# Build format from offer data
|
|
115
|
+
#
|
|
116
|
+
# @param offer [Hash] Offer data
|
|
117
|
+
# @param format_type [String] Format type
|
|
118
|
+
# @return [AsmeFormat] Format object
|
|
119
|
+
def build_format_from_offer(offer, format_type)
  raw_price = offer["price"]

  # The price may arrive as an Integer, a Float or a String ("125",
  # "125.00"). Appending ".00" to the raw value produced strings such as
  # "$125.00.00", "$99.5.00" or "$.00", so parse the amount and render it
  # with exactly two decimals. Currency symbol, thousands separators and
  # spaces are stripped before parsing.
  amount = Float(raw_price.to_s.delete("$, "), exception: false)

  # A missing or non-numeric price is stored as nil, not a fake amount.
  price_str = amount ? format("$%.2f", amount) : nil

  AsmeFormat.new(
    format_type: format_type,
    price: price_str
  )
end
|
|
128
|
+
end
|
|
129
|
+
end
|
|
130
|
+
end
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
require "yaml"

require_relative "index_page"
require_relative "publication_page"
|
|
5
|
+
|
|
6
|
+
module RelatonAsme
|
|
7
|
+
module Fetcher
|
|
8
|
+
# Orchestrates the fetching of ASME publications
|
|
9
|
+
class Runner
|
|
10
|
+
attr_reader :agent
|
|
11
|
+
|
|
12
|
+
# Initialize the runner
|
|
13
|
+
#
|
|
14
|
+
# @param agent [Mechanize, nil] Optional Mechanize agent to reuse
|
|
15
|
+
def initialize(agent: nil)
  # Reuse the caller's agent when one is given; otherwise build a default.
  @agent = agent
  @agent ||= create_agent
end
|
|
18
|
+
|
|
19
|
+
# Fetch all publications from all pages
|
|
20
|
+
#
|
|
21
|
+
# @param max_pages [Integer, nil] Maximum pages to fetch (nil = all)
|
|
22
|
+
# @param per_page [Integer] Results per page
|
|
23
|
+
# @return [Array<AsmePublicationEdition>] All editions
|
|
24
|
+
def fetch_all(max_pages: nil, per_page: 100)
  collected = []

  # Walk index pages from 1 upwards until the page limit is reached or the
  # index reports that no further pages exist.
  (1..).each do |current|
    puts "Fetching page #{current}..."
    collected.concat(fetch_page(current, per_page: per_page))

    # The limit is checked first, so more_pages? is not consulted once the
    # requested number of pages has been fetched.
    limit_reached = max_pages && current >= max_pages
    break if limit_reached || !more_pages?(current, per_page)
  end

  collected
end
|
|
41
|
+
|
|
42
|
+
# Fetch publications from a specific page
|
|
43
|
+
#
|
|
44
|
+
# @param page_number [Integer] Page number to fetch
|
|
45
|
+
# @param per_page [Integer] Results per page
|
|
46
|
+
# @return [Array<AsmePublicationEdition>] Editions from the page
|
|
47
|
+
def fetch_page(page_number, per_page: 100)
  listing = IndexPage.new(agent: @agent, per_page: per_page, page_number: page_number)

  # Collect the publication links from the index page, then fetch each one.
  found = listing.publication_urls
  puts "Found #{found.size} publications on page #{page_number}"

  fetch_publications(found)
end
|
|
59
|
+
|
|
60
|
+
# Fetch multiple publications by URLs
|
|
61
|
+
#
|
|
62
|
+
# @param urls [Array<String>] Publication URLs
|
|
63
|
+
# @return [Array<AsmePublicationEdition>] All editions
|
|
64
|
+
def fetch_publications(urls)
  total = urls.size

  urls.each.with_index(1).with_object([]) do |(url, position), collected|
    puts "Fetching publication #{position}/#{total}: #{url}"

    # Publications that are nil or have no editions contribute nothing.
    found = fetch_publication(url)&.editions
    collected.concat(found) if found
  rescue StandardError => e
    # A failure on one URL is reported and must not stop the other URLs.
    warn "Failed to fetch #{url}: #{e.message}"
  end
end
|
|
80
|
+
|
|
81
|
+
# Fetch a single publication by URL
|
|
82
|
+
#
|
|
83
|
+
# @param url [String] Publication URL
|
|
84
|
+
# @return [AsmePublication, nil] Publication with editions
|
|
85
|
+
def fetch_publication(url)
  # Wrap the URL in a page object sharing this runner's agent and convert it.
  PublicationPage.new(url, agent: @agent).to_data
end
|
|
89
|
+
|
|
90
|
+
# Check if more pages exist
|
|
91
|
+
#
|
|
92
|
+
# @param page_number [Integer] Current page number
|
|
93
|
+
# @param per_page [Integer] Results per page
|
|
94
|
+
# @return [Boolean] True if more pages exist
|
|
95
|
+
def more_pages?(page_number, per_page)
  # Builds a fresh IndexPage for the given page and asks it directly.
  IndexPage
    .new(agent: @agent, per_page: per_page, page_number: page_number)
    .more_pages?
end
|
|
103
|
+
|
|
104
|
+
# Fetch all editions for publications and save to file
|
|
105
|
+
#
|
|
106
|
+
# @param output_file [String] Path to output file
|
|
107
|
+
# @param format [Symbol] Output format (:json or :yaml)
|
|
108
|
+
# @param max_pages [Integer, nil] Maximum pages to fetch
|
|
109
|
+
# @return [Array<AsmePublicationEdition>] All editions
|
|
110
|
+
def fetch_all_editions(output_file, format: :json, max_pages: nil)
  # Fail fast: reject an unsupported format before any page is fetched, so
  # a bad argument does not surface only after the whole scrape has run.
  # Same error class and message as the check made when saving.
  unless %i[json yaml].include?(format)
    raise ArgumentError, "Unsupported format: #{format}"
  end

  editions = fetch_all(max_pages: max_pages)

  save_editions(editions, output_file, format)

  # Return the fetched editions so callers can use them without re-reading
  # the output file.
  editions
end
|
|
117
|
+
|
|
118
|
+
# Save editions to file
|
|
119
|
+
#
|
|
120
|
+
# @param editions [Array<AsmePublicationEdition>] Editions to save
|
|
121
|
+
# @param output_file [String] Path to output file
|
|
122
|
+
# @param format [Symbol] Output format (:json or :yaml)
|
|
123
|
+
def save_editions(editions, output_file, format)
  # Map each supported format to the serializer that renders it.
  serializers = { json: :serialize_to_json, yaml: :serialize_to_yaml }
  serializer = serializers.fetch(format) do
    raise ArgumentError, "Unsupported format: #{format}"
  end

  # Nothing is written when the format was rejected above.
  File.write(output_file, send(serializer, editions))
  puts "Saved #{editions.size} editions to #{output_file}"
end
|
|
136
|
+
|
|
137
|
+
private
|
|
138
|
+
|
|
139
|
+
# Create a new Mechanize agent
|
|
140
|
+
#
|
|
141
|
+
# @return [Mechanize] Configured agent
|
|
142
|
+
def create_agent
  Mechanize.new.tap do |mech|
    # Identify as a desktop Safari browser.
    mech.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2_1) " \
                      "AppleWebKit/605.1.15 (KHTML, like Gecko) " \
                      "Version/17.2 Safari/605.1.15"
    # Follow both meta-refresh and HTTP redirects.
    mech.follow_meta_refresh = true
    mech.redirect_ok = true
  end
end
|
|
151
|
+
|
|
152
|
+
# Serialize editions to JSON
|
|
153
|
+
#
|
|
154
|
+
# @param editions [Array<AsmePublicationEdition>] Editions
|
|
155
|
+
# @return [String] JSON string
|
|
156
|
+
def serialize_to_json(editions)
  # Convert every edition to its hash form, then pretty-print the list.
  rows = editions.map { |edition| edition.to_hash }
  JSON.pretty_generate(rows)
end
|
|
159
|
+
|
|
160
|
+
# Serialize editions to YAML
|
|
161
|
+
#
|
|
162
|
+
# @param editions [Array<AsmePublicationEdition>] Editions
|
|
163
|
+
# @return [String] YAML string
|
|
164
|
+
def serialize_to_yaml(editions)
  # Use YAML.dump explicitly instead of Object#to_yaml: to_yaml is only
  # available once the yaml library has been loaded, and this file did not
  # load it itself. The produced document is the same.
  YAML.dump(editions.map(&:to_hash))
end
|
|
167
|
+
end
|
|
168
|
+
end
|
|
169
|
+
end
|
data/lib/relaton_asme.rb
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "relaton_asme/version"
|
|
4
|
+
require_relative "relaton_asme/fetcher/asme_format"
|
|
5
|
+
require_relative "relaton_asme/fetcher/asme_publication_edition"
|
|
6
|
+
require_relative "relaton_asme/fetcher/asme_publication"
|
|
7
|
+
require_relative "relaton_asme/fetcher/basic_page"
|
|
8
|
+
require_relative "relaton_asme/fetcher/index_page"
|
|
9
|
+
require_relative "relaton_asme/fetcher/publication_page"
|
|
10
|
+
require_relative "relaton_asme/fetcher/runner"
|
|
11
|
+
|
|
12
|
+
module RelatonAsme
  # Error class in the gem's own namespace; a plain StandardError subclass.
  class Error < StandardError
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: relaton-asme
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Ribose Inc.
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2025-12-15 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: lutaml-model
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - "~>"
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '0.7'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - "~>"
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '0.7'
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: mechanize
|
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - ">="
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: '0'
|
|
34
|
+
type: :runtime
|
|
35
|
+
prerelease: false
|
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
+
requirements:
|
|
38
|
+
- - ">="
|
|
39
|
+
- !ruby/object:Gem::Version
|
|
40
|
+
version: '0'
|
|
41
|
+
description: Retrieve bibliographic information of ASME standards and codes.
|
|
42
|
+
email:
|
|
43
|
+
- open.source@ribose.com
|
|
44
|
+
executables: []
|
|
45
|
+
extensions: []
|
|
46
|
+
extra_rdoc_files: []
|
|
47
|
+
files:
|
|
48
|
+
- LICENSE
|
|
49
|
+
- README.adoc
|
|
50
|
+
- lib/relaton_asme.rb
|
|
51
|
+
- lib/relaton_asme/fetcher/asme_format.rb
|
|
52
|
+
- lib/relaton_asme/fetcher/asme_publication.rb
|
|
53
|
+
- lib/relaton_asme/fetcher/asme_publication_edition.rb
|
|
54
|
+
- lib/relaton_asme/fetcher/basic_page.rb
|
|
55
|
+
- lib/relaton_asme/fetcher/index_page.rb
|
|
56
|
+
- lib/relaton_asme/fetcher/publication_page.rb
|
|
57
|
+
- lib/relaton_asme/fetcher/runner.rb
|
|
58
|
+
- lib/relaton_asme/version.rb
|
|
59
|
+
homepage: https://github.com/relaton/relaton-asme
|
|
60
|
+
licenses:
|
|
61
|
+
- BSD-2-Clause
|
|
62
|
+
metadata:
|
|
63
|
+
homepage_uri: https://github.com/relaton/relaton-asme
|
|
64
|
+
source_code_uri: https://github.com/relaton/relaton-asme
|
|
65
|
+
rubygems_mfa_required: 'true'
|
|
66
|
+
post_install_message:
|
|
67
|
+
rdoc_options: []
|
|
68
|
+
require_paths:
|
|
69
|
+
- lib
|
|
70
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
71
|
+
requirements:
|
|
72
|
+
- - ">="
|
|
73
|
+
- !ruby/object:Gem::Version
|
|
74
|
+
version: 3.0.0
|
|
75
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
76
|
+
requirements:
|
|
77
|
+
- - ">="
|
|
78
|
+
- !ruby/object:Gem::Version
|
|
79
|
+
version: '0'
|
|
80
|
+
requirements: []
|
|
81
|
+
rubygems_version: 3.5.22
|
|
82
|
+
signing_key:
|
|
83
|
+
specification_version: 4
|
|
84
|
+
summary: Fetch and parse ASME standards and codes
|
|
85
|
+
test_files: []
|