relaton-ieee 2.1.1 → 2.1.2
- checksums.yaml +4 -4
- data/Rakefile +1 -1
- data/lib/relaton/ieee/bibliography.rb +1 -1
- data/lib/relaton/ieee/data_fetcher.rb +466 -33
- data/lib/relaton/ieee/idams_parser.rb +10 -1
- data/lib/relaton/ieee/version.rb +1 -1
- metadata +2 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5a2e8b9dc4134eba7c8047ca10a447fa89a6cad9df60006130e45fff3f8a917c
+  data.tar.gz: 2a3b2d228652b90e6db5bc9bc77b36d6a07bb51ddf38b211c2a8ecb17b85de97
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: cfca04d99a0f3c8eb8ca20168cbb9d3c36ca6806db8537c604f86bd9d21419c44d67250c825ea6a1e4001a1969931c8db69bb320829b3e074e7f3eb259df3861
+  data.tar.gz: 2575d47ae68327ba17c780f986a0a0a50511b73bbb459a23a95aada71771f845181da15f63bd9d01ab6ce133c275641afd81628c3e1a84f67e768b710499de4b

data/Rakefile CHANGED

@@ -11,7 +11,7 @@ namespace :spec do
     require "net/http"
     require "uri"

-    url = "https://raw.githubusercontent.com/relaton/relaton-data-ieee/
+    url = "https://raw.githubusercontent.com/relaton/relaton-data-ieee/v2/index-v1.zip"
    dest = File.join(__dir__, "spec", "fixtures", "index-v1.zip")

     puts "Downloading #{url} ..."

data/lib/relaton/ieee/bibliography.rb CHANGED

@@ -1,7 +1,7 @@
 module Relaton
   module Ieee
     class Bibliography
-      GH_URL = "https://raw.githubusercontent.com/relaton/relaton-data-ieee/refs/heads/
+      GH_URL = "https://raw.githubusercontent.com/relaton/relaton-data-ieee/refs/heads/v2/".freeze

       class << self
         #

data/lib/relaton/ieee/data_fetcher.rb CHANGED

@@ -1,3 +1,4 @@
+require "etc"
 require "zip"
 require_relative "../ieee"
 require_relative "converter/bibxml"

@@ -26,17 +27,10 @@ module Relaton
         Util.error msg
       end

-      def fetch(_source = nil)
-        Dir["ieee-rawbib/**/*.{xml,zip}"].reject { |f| f["Deleted_"] }
-
-
-                when ".xml" then File.read f, encoding: "UTF-8"
-                end
-          fetch_doc xml, f
-        rescue StandardError => e
-          Util.error "File: #{f}\n#{e.message}\n#{e.backtrace}"
-        end
-        # File.write "normtitles.txt", @normtitles.join("\n")
+      def fetch(_source = nil)
+        files = Dir["ieee-rawbib/**/*.{xml,zip}"].reject { |f| f["Deleted_"] }
+        files = prefilter_winners(files) unless ENV["IEEE_FETCH_PREFILTER"] == "0"
+        process_files(files)
         update_relations
         report_errors
       end
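
For orientation, the rewritten fetch pipeline is tuned entirely through environment variables. Below is a minimal sketch of how the knobs shown in this diff combine; `fetcher` is a stand-in for however relaton-ieee instantiates its DataFetcher, which this diff does not show:

    # Hypothetical driver; knob names and defaults are taken from the diff.
    ENV["IEEE_FETCH_PREFILTER"] = "1"        # "0" disables the cheap pre-filter
    ENV["IEEE_FETCH_PREFILTER_MIN"] = "200"  # below this many files, prefilter is skipped
    ENV["IEEE_PREFILTER_BATCH"] = "5000"     # files per forked prefilter batch
    ENV["IEEE_FETCH_PROCESSES"] = "8"        # worker count; defaults to Etc.nprocessors
    ENV["IEEE_FETCH_BATCH"] = "1000"         # files handed to each short-lived worker
    fetcher.fetch                            # walks ieee-rawbib/**/*.{xml,zip}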

@@ -46,8 +40,27 @@
         @backrefs ||= {}
       end

+      # @return [Hash] list of docnumber => parsed bib (cache for update_relations)
+      def docs
+        @docs ||= {}
+      end
+
+      # @return [Hash] docnumber => max global glob-index whose write was
+      # accepted by commit_doc. Populated only when running with parallel
+      # workers (writes are staged to per-glob-index suffixed paths and
+      # reconciled into the final filename after the parsing phase).
+      def saved_writes
+        @saved_writes ||= {}
+      end
+
+      # Mutex guarding worker-thread mutations of shared state during parse.
+      def mutex
+        @mutex ||= Mutex.new
+      end
+
       #
-      # Save unresolved relation reference
+      # Save unresolved relation reference. Called from worker threads via
+      # IdamsParser#parse_relation, so mutates crossrefs under a mutex.
       #
       # @param [String] docnumber of main document
       # @param [Nokogiri::XML::Element] amsid relation data

@@ -56,7 +69,7 @@
         return if RELATION_TYPES[amsid.type] == false

         ref = { amsid: amsid.date_string, type: amsid.type }
-        crossrefs[docnumber] << ref
+        mutex.synchronize { crossrefs[docnumber] << ref }
       end

       #
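
The new mutex guards a plain Hash-of-Arrays append, which is not safe under concurrent mutation. A standalone sketch of the hazard and the fix, with toy names rather than relaton-ieee's API:

    crossrefs = Hash.new { |h, k| h[k] = [] }
    lock = Mutex.new

    threads = 8.times.map do |i|
      Thread.new do
        1_000.times { lock.synchronize { crossrefs["doc"] << i } }
      end
    end
    threads.each(&:join)
    crossrefs["doc"].size # => 8000 with the lock; without it, the Hash
                          #    default_proc can fire twice for a first-seen
                          #    key and one thread's appends get dropped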

@@ -109,47 +122,463 @@
       end

       #
-      #
+      # Pre-filter the input file list down to the subset that actually
+      # has to be fully parsed.
+      #
+      # The IEEE rawbib dataset has ~50× duplication: every docnumber
+      # appears in `cache/` plus most `updates.YYYYMMDD/` folders. The
+      # original semantic is "latest update wins on disk", so for any
+      # docnumber that has at least one updates-folder file, the cache
+      # file's parse result is just thrown away. Pre-filter avoids
+      # parsing those throwaway files entirely.
+      #
+      # The cheap path here only has to extract three small XML elements
+      # (normtitle, stdnumber, standard_id) per file — done with
+      # regex on the raw XML so we skip lutaml-model's heavy DOM-to-
+      # object construction (which is what dominates fetch time).
+      #
+      # Selection rules:
+      # - For each docnumber with any updates-folder entry: keep only
+      #   the highest-glob-idx updates-folder file.
+      # - For docnumbers with cache-folder entries only: keep all
+      #   of them (commit_doc's matches-stdnumber dedup handles them).
+      # - Files where the cheap parse couldn't compute a docnumber
+      #   are kept as-is — the full parse will surface any real error.
+      #
+      # Disable with IEEE_FETCH_PREFILTER=0.
+      #
+      def prefilter_winners(files)
+        threshold = Integer(ENV["IEEE_FETCH_PREFILTER_MIN"] || 200)
+        return files if files.size < threshold
+
+        procs = Integer(ENV["IEEE_FETCH_PROCESSES"] || Etc.nprocessors)
+        index = procs <= 1 ? prefilter_serial(files) : prefilter_parallel(files, procs)
+        select_prefilter_winners(index, files.size)
+      end
+
+      def prefilter_serial(files)
+        files.each_with_index.map { |f, i| extract_index_entry(i, f) }.compact
+      end
+
+      def prefilter_parallel(files, procs) # rubocop:disable Metrics/MethodLength
+        batch_size = Integer(ENV["IEEE_PREFILTER_BATCH"] || 5000)
+        batches = files.each_slice(batch_size).each_with_index.to_a
+
+        next_batch = 0
+        inflight = {}
+        collected = []
+
+        procs.times do
+          break if next_batch >= batches.size
+
+          inflight.merge!(spawn_prefilter_batch(*batches[next_batch], batch_size))
+          next_batch += 1
+        end
+
+        until inflight.empty?
+          pid = Process.wait
+          collected << inflight.delete(pid)
+
+          if next_batch < batches.size
+            inflight.merge!(spawn_prefilter_batch(*batches[next_batch], batch_size))
+            next_batch += 1
+          end
+        end
+
+        index = []
+        collected.each do |path|
+          next unless path && File.exist?(path) && File.size(path).positive?
+
+          index.concat(Marshal.load(File.binread(path)))
+          File.unlink(path)
+        end
+        index
+      end
+
+      def spawn_prefilter_batch(batch_files, batch_idx, batch_size)
+        require "tmpdir"
+        require "securerandom"
+        state_path = File.join(
+          Dir.tmpdir,
+          "ieee_prefilter_#{Process.pid}_#{batch_idx}_#{SecureRandom.hex(4)}.bin",
+        )
+        base_idx = batch_idx * batch_size
+
+        pid = Process.fork do
+          entries = batch_files.each_with_index.map do |file, i|
+            extract_index_entry(base_idx + i, file)
+          end.compact
+          File.binwrite(state_path, Marshal.dump(entries))
+          exit!(0)
+        end
+        { pid => state_path }
+      end
+
+      #
+      # Cheap-parse one file: read XML, regex-extract three fields,
+      # compute docnumber via the existing RawbibIdParser. Returns
+      # `[glob_idx, file, docnumber_or_nil, in_updates_folder?]`.
+      #
+      def extract_index_entry(idx, file)
+        xml = case File.extname(file)
+              when ".zip" then read_zip(file)
+              when ".xml" then File.read(file, encoding: "UTF-8")
+              end
+        return nil unless xml
+        return nil if cheap_extract_field(xml, "standard_id") == "0"
+
+        normtitle = cheap_extract_field(xml, "normtitle")
+        stdnumber = cheap_extract_field(xml, "stdnumber")
+        docnumber = nil
+        if normtitle && stdnumber
+          pubid = RawbibIdParser.parse(normtitle, stdnumber)
+          docnumber = pubid&.to_id
+        end
+        [idx, file, docnumber, file.include?("/updates.")]
+      rescue StandardError
+        # Cheap parse couldn't handle this file — keep it; full parse will
+        # either succeed or surface the real error.
+        [idx, file, nil, file.include?("/updates.")]
+      end
+
+      def cheap_extract_field(xml, tag)
+        m = xml.match(%r{<#{tag}[^>]*?>(?:<!\[CDATA\[)?(.*?)(?:\]\]>)?</#{tag}>}m)
+        m && m[1].strip
+      end
+
+      def select_prefilter_winners(index, total)
+        unknown = index.select { |e| e[2].nil? }
+        by_doc = index.reject { |e| e[2].nil? }.group_by { |e| e[2] }
+
+        selected = []
+        by_doc.each_value do |entries|
+          updates = entries.select { |e| e[3] }
+          if updates.any?
+            selected << updates.max_by { |e| e[0] }
+          else
+            selected.concat(entries)
+          end
+        end
+
+        kept = (selected + unknown).sort_by { |e| e[0] }.map { |e| e[1] }
+        Util.warn "Prefilter: #{total} input files -> #{kept.size} winners " \
+                  "(#{(100.0 * kept.size / total).round(1)}%)"
+        kept
+      end
+
+      #
+      # Parse files across a pool of short-lived forked workers. Each
+      # worker processes one bounded batch (IEEE_FETCH_BATCH files,
+      # default 1000), writes its output YAMLs to disk, marshals its
+      # local backrefs / crossrefs / errors to a tmp file, and exits.
+      # The parent keeps `procs` workers in flight; as each one exits
+      # it collects that worker's state file and spawns the next batch.
       #
-      #
-      #
+      # Why short-lived workers, not one long-running shard per core:
+      # Ruby's heap grows monotonically and the VM doesn't return
+      # freed memory to the OS, so a child that parses 50k files ends
+      # up at 1+ GB RSS even with the docs cache disabled. With ten
+      # such children the box swaps and slows to a crawl. Exiting a
+      # child after a batch of a few thousand files lets the OS
+      # reclaim its heap; the next fork starts fresh from the parent's
+      # baseline. Fork is cheap (copy-on-write), so the overhead is
+      # negligible compared to the memory savings.
       #
-
-
-
+      # Caveats from sharding (same as the previous design):
+      # - Cross-batch duplicates: when the same docnumber appears in
+      #   multiple batches, the last-finishing batch's write wins.
+      #   Merged backrefs/crossrefs are still complete, so
+      #   update_relations resolves cross-refs correctly.
+      # - "Document exists" warnings are per-batch, so cross-batch
+      #   duplicates may not log a warning. Logging only.
+      #
+      # @param [Array<String>] files paths to rawbib XML/zip files
+      #
+      def process_files(files) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
+        procs = Integer(ENV["IEEE_FETCH_PROCESSES"] || Etc.nprocessors)
+        procs = 1 if files.empty? || procs < 2 || files.size < procs * 2
+
+        return run_shard(files, 0) if procs <= 1
+
+        batch_size = Integer(ENV["IEEE_FETCH_BATCH"] || 1000)
+        batches = files.each_slice(batch_size).each_with_index.to_a
+
+        state_paths = run_worker_pool(batches, procs)
+        merge_state_files(state_paths)
+        reconcile_staged_outputs
+      end
+
+      #
+      # Promote the highest-glob-index staged write per docnumber to its
+      # final on-disk filename, then delete any leftover staged files.
+      # Restores exact "latest update wins" semantics across batches:
+      # without this pass, a slow batch finishing late could overwrite a
+      # newer update that an earlier-completing batch had already saved.
+      #
+      def reconcile_staged_outputs
+        return if saved_writes.empty?
+
+        saved_writes.each do |docnumber, max_idx|
+          final = output_file(docnumber)
+          winner = "#{final}.#{max_idx}"
+          File.rename(winner, final) if File.exist?(winner)
+        end
+
+        # Stragglers: any remaining staged files (losing duplicates,
+        # or bib filenames that didn't end up in saved_writes due to a
+        # crash) get cleaned up so they don't pollute `data/`.
+        Dir.glob(File.join(@output, "*.#{@ext}.*")).each do |f|
+          File.unlink(f)
         rescue StandardError
-
-
+          # ignore — best-effort cleanup
+        end
+      end
+
+      #
+      # Merge all batch state files into the parent's hashes. Runs once,
+      # after the worker pool has drained, so the parent's heap only
+      # has to hold the cumulative merged state (small) plus one batch's
+      # transient marshaled payload at a time.
+      #
+      def merge_state_files(state_paths)
+        state_paths.each_with_index do |path, i|
+          merge_batch_state(path)
+          # Periodic GC.start keeps the transient marshal allocations
+          # from piling up over hundreds of merges.
+          GC.start if (i % 50).zero?
+        end
+      end
+
+      #
+      # Maintain `procs` concurrent short-lived workers. Each Process.wait
+      # call blocks until any worker exits; we collect its state-file
+      # path and spawn the next batch (if any).
+      #
+      # Critically, we do NOT merge state into the parent's hashes here.
+      # Loading and merging dozens of MB of marshaled hashes per batch
+      # bloated the parent's heap into the multi-GB range, and every
+      # subsequent fork inherited that bloat via copy-on-write — driving
+      # the box into swap. By deferring all merging to after the parsing
+      # phase, the parent stays at ~baseline RSS while children are alive,
+      # so each fork's COW baseline is small.
+      #
+      # @return [Array<String>] state-file paths in completion order
+      #
+      def run_worker_pool(batches, procs) # rubocop:disable Metrics/MethodLength
+        next_batch = 0
+        inflight = {} # pid => state_path
+        collected = []
+
+        procs.times do
+          break if next_batch >= batches.size
+
+          inflight.merge!(spawn_batch(*batches[next_batch]))
+          next_batch += 1
+        end
+
+        until inflight.empty?
+          pid = Process.wait
+          collected << inflight.delete(pid)
+
+          if next_batch < batches.size
+            inflight.merge!(spawn_batch(*batches[next_batch]))
+            next_batch += 1
+          end
+        end
+
+        collected
+      end
+
+      #
+      # Fork one short-lived worker for a single batch. Returns a
+      # `{pid => state_path}` Hash. The worker writes its marshaled
+      # local state to `state_path` then exits; the tmp file is read
+      # and unlinked by the parent in `merge_batch_state`.
+      #
+      def spawn_batch(batch_files, batch_idx) # rubocop:disable Metrics/MethodLength
+        require "tmpdir"
+        require "securerandom"
+        state_path = File.join(
+          Dir.tmpdir,
+          "ieee_fetch_#{Process.pid}_#{batch_idx}_#{SecureRandom.hex(4)}.bin",
+        )
+        base_idx = batch_idx * Integer(ENV["IEEE_FETCH_BATCH"] || 1000)
+
+        pid = Process.fork do
+          batch_files.each_with_index do |file, i|
+            glob_idx = base_idx + i
+            result = parse_entry(glob_idx, file)
+            next unless result
+
+            _, _, doc, bib, local_errors = result
+            merge_errors(local_errors)
+            commit_doc(doc, bib, file, glob_idx)
+          end
+          File.binwrite(state_path, Marshal.dump(
+            backrefs: backrefs,
+            crossrefs: {}.merge(crossrefs),
+            errors: {}.merge(@errors),
+            saved_writes: saved_writes,
+          ))
+          exit!(0)
+        end
+
+        { pid => state_path }
+      end
+
+      #
+      # Read one batch's marshaled state, merge into parent state,
+      # remove the tmp file. Tolerates a missing/empty file (worker
+      # crash) by treating it as an empty merge.
+      #
+      def merge_batch_state(state_path)
+        if state_path && File.exist?(state_path) && File.size(state_path).positive?
+          payload = Marshal.load(File.binread(state_path))
+          merge_shard_state(payload)
+        end
+      ensure
+        File.unlink(state_path) if state_path && File.exist?(state_path)
+      end
+
+      #
+      # Merge one child's per-shard state into the parent's. backrefs uses
+      # ||= so the first-merged value wins for any amsid/docnumber pair
+      # that happens to appear in multiple shards (in practice they agree).
+      # `saved_writes` tracks the highest glob-index at which any worker
+      # saved a doc, so the parent can later rename the winning staged
+      # file to its final name.
+      #
+      def merge_shard_state(state)
+        state[:backrefs].each { |amsid, content| backrefs[amsid] ||= content }
+        state[:crossrefs].each { |dnum, refs| crossrefs[dnum].concat(refs) }
+        state[:errors].each { |k, v| @errors[k] &&= v }
+        (state[:saved_writes] || {}).each do |dnum, idx|
+          prev = saved_writes[dnum]
+          saved_writes[dnum] = idx if prev.nil? || idx > prev
         end
-
+      end

-
+      #
+      # Process one shard sequentially. Either runs in a forked child or,
+      # when procs == 1, in the parent.
+      #
+      # `shard` is an array of [original_idx, file] tuples (or, when
+      # called from the procs==1 fallback, just the array of file paths
+      # — we normalize below).
+      #
+      def run_shard(shard, _shard_idx)
+        shard.each_with_index do |entry, i|
+          idx, file = entry.is_a?(Array) ? entry : [i, entry]
+          result = parse_entry(idx, file)
+          next unless result
+
+          _, _, doc, bib, local_errors = result
+          merge_errors(local_errors)
+          commit_doc(doc, bib, file)
+        end
+      end
+
+      #
+      # Worker entry point: read file, parse XML, build bib.
+      # Returns nil for files we should skip; otherwise a tuple consumed
+      # by the caller's commit loop.
+      #
+      # @param [Integer] idx original glob index (preserves dedup order)
+      # @param [String] file path to rawbib file
+      #
+      # @return [Array, nil] [idx, file, doc, bib, local_errors] or nil
+      #
+      def parse_entry(idx, file)
+        xml = case File.extname(file)
+              when ".zip" then read_zip file
+              when ".xml" then File.read file, encoding: "UTF-8"
+              end
+        doc = begin
+          ::Ieee::Idams::Publication.from_xml(xml)
+        rescue StandardError
+          Util.warn "Empty file: `#{file}`"
+          return nil
+        end
+        return nil if doc.publicationinfo&.standard_id == "0"
+
+        local_errors = Hash.new(true)
+        bib = IdamsParser.new(doc, self, local_errors).parse
         if bib.docnumber.nil?
-          Util.warn "PubID parse error. Normtitle: `#{doc.normtitle}`, file: `#{
-          return
+          Util.warn "PubID parse error. Normtitle: `#{doc.normtitle}`, file: `#{file}`"
+          return nil
         end
+        [idx, file, doc, bib, local_errors]
+      rescue StandardError => e
+        Util.error "File: #{file}\n#{e.message}\n#{e.backtrace}"
+        nil
+      end
+
+      #
+      # Merge a worker's local errors hash into the shared @errors hash,
+      # preserving the existing AND semantics (`@errors[k] &&= v`).
+      #
+      def merge_errors(local_errors)
+        local_errors.each { |k, v| @errors[k] &&= v }
+      end
+
+      #
+      # Dedup against backrefs and save. This runs once per parsed file —
+      # in the parent for the procs==1 fallback, or in each forked child
+      # for its shard. Same logic the old fetch_doc tail had, plus
+      # optional staged-output bookkeeping when `glob_idx` is provided.
+      #
+      # When `glob_idx` is given (parallel mode), save_doc writes to a
+      # per-glob-index suffixed path; the parent reconciles after the
+      # parsing phase and renames the highest-glob-index winner per
+      # docnumber to the final filename. This preserves the original
+      # "latest update wins on disk" semantic across batch boundaries
+      # — without it, a slow batch finishing late could overwrite a
+      # newer update written by an earlier-completing batch.
+      #
+      def commit_doc(doc, bib, filename, glob_idx = nil)
         amsid = doc.publicationinfo.amsid
         if backrefs.value?(bib.docidentifier[0].content) && /updates\.\d+/ !~ filename
           oamsid = backrefs.key bib.docidentifier[0].content
           Util.warn "Document exists ID: `#{bib.docidentifier[0].content}` AMSID: " \
                     "`#{amsid}` source: `#{filename}`. Other AMSID: `#{oamsid}`"
           if bib.docidentifier.find(&:primary).content.include?(doc.publicationinfo.stdnumber)
-            save_doc
+            save_doc(bib, glob_idx) # rewrite file if the PubID matches the stdnumber
             backrefs[amsid] = bib.docidentifier[0].content
+            track_save(bib.docnumber, glob_idx)
           end
         else
-          save_doc
+          save_doc(bib, glob_idx)
           backrefs[amsid] = bib.docidentifier[0].content
+          track_save(bib.docnumber, glob_idx)
         end
       end

       #
-      #
+      # Record that we wrote a staged copy of `docnumber` at this
+      # `glob_idx`. The parent later picks the highest tracked idx
+      # per docnumber as the surviving on-disk version.
+      #
+      def track_save(docnumber, glob_idx)
+        return unless glob_idx
+
+        prev = saved_writes[docnumber]
+        saved_writes[docnumber] = glob_idx if prev.nil? || glob_idx > prev
+      end
+
+      #
+      # Save document to file. When `glob_idx` is provided (parallel
+      # mode), writes to a per-glob-index suffixed staging path so
+      # concurrent workers can't clobber each other's files; the parent
+      # reconciles after parsing. With no glob_idx, writes the final
+      # filename directly (sequential mode and update_relations).
       #
       # @param [RelatonIeee::IeeeBibliographicItem] bib
+      # @param [Integer, nil] glob_idx position in the original file glob
       #
-      def save_doc(bib)
-
+      def save_doc(bib, glob_idx = nil)
+        path = output_file(bib.docnumber)
+        path = "#{path}.#{glob_idx}" if glob_idx
+        File.write path, serialize(bib), encoding: "UTF-8"
       end

       def to_yaml(bib) = bib.to_yaml
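
Stripped of the relaton-ieee specifics, the batching machinery above follows a reusable pattern: keep a bounded number of short-lived forked children in flight, have each child Marshal its result to a temp file, and reap children with Process.wait. A self-contained sketch of just that skeleton, with invented names (POSIX-only, like the Process.fork calls in the diff):

    require "tmpdir"
    require "securerandom"

    # Run the block over each batch in at most `procs` concurrent forked
    # children, collecting each child's Marshal-ed return value in the parent.
    def map_in_forks(batches, procs, &work)
      pending  = batches.each_with_index.to_a
      inflight = {} # pid => state file path
      results  = []

      spawn = lambda do
        batch, idx = pending.shift
        path = File.join(Dir.tmpdir, "pool_#{Process.pid}_#{idx}_#{SecureRandom.hex(4)}.bin")
        pid = Process.fork do
          File.binwrite(path, Marshal.dump(work.call(batch)))
          exit!(0) # exit! skips at_exit hooks inherited from the parent
        end
        inflight[pid] = path
      end

      [procs, pending.size].min.times { spawn.call }
      until inflight.empty?
        path = inflight.delete(Process.wait) # blocks until any child exits
        results << Marshal.load(File.binread(path)) if File.size?(path)
        File.unlink(path) if File.exist?(path)
        spawn.call unless pending.empty?
      end
      results # completion order, like run_worker_pool above
    end

    # map_in_forks([[1, 2], [3, 4], [5, 6]], 2) { |batch| batch.sum }
    # => [3, 7, 11] in whichever order the children finished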

@@ -157,23 +586,27 @@
       def to_bibxml(bib) = bib.to_rfcxml

       #
-      #
+      # Resolve cross-references collected during parse. Uses the in-memory
+      # `docs` cache so we don't re-read+re-deserialize files from disk, and
+      # writes each mutated bib once instead of once per relation.
       #
       def update_relations # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
         crossrefs.each do |dnum, rfs|
           bib = nil
+          mutated = false
           rfs.each do |rf|
             if backrefs[rf[:amsid]]
               rel = create_relation(rf[:type], backrefs[rf[:amsid]])
               if rel
-                bib ||= read_bib(dnum)
+                bib ||= docs[dnum] || read_bib(dnum)
                 bib.relation << rel
-
+                mutated = true
               end
             else
               Util.warn "Unresolved relation: '#{rf[:amsid]}' type: '#{rf[:type]}' for '#{dnum}'"
             end
           end
+          save_doc(bib) if mutated
         end
       end
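
To make the prefilter selection rules concrete, here is the core of select_prefilter_winners run by hand over invented [glob_idx, file, docnumber, in_updates_folder?] entries:

    index = [
      [0, "cache/a.xml",            "IEEE 802.3", false],
      [1, "updates.20240101/a.xml", "IEEE 802.3", true],
      [2, "updates.20240201/a.xml", "IEEE 802.3", true],
      [3, "cache/b.xml",            "IEEE 1588",  false],
      [4, "cache/odd.xml",          nil,          false], # cheap parse failed
    ]

    unknown  = index.select { |e| e[2].nil? }
    by_doc   = index.reject { |e| e[2].nil? }.group_by { |e| e[2] }
    selected = by_doc.values.flat_map do |entries|
      updates = entries.select { |e| e[3] }
      updates.any? ? [updates.max_by { |e| e[0] }] : entries
    end

    (selected + unknown).sort_by { |e| e[0] }.map { |e| e[1] }
    # => ["updates.20240201/a.xml", "cache/b.xml", "cache/odd.xml"]
    # The cache copy of a.xml and the older update are never fully parsed.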

data/lib/relaton/ieee/idams_parser.rb CHANGED

@@ -11,6 +11,12 @@
         relation source keyword ext
       ].freeze

+      # Upstream IDAMS abstracts sometimes carry escaped ASCII control
+      # characters as printable tokens like `<<ETX>>`. They are meaningless
+      # in output, and `<<…>>` blows up XML serialization downstream
+      # (libxml2 reads `<<` as the start of a tag). Strip the whole family.
+      CONTROL_PLACEHOLDER_RE = /<<[A-Z]{2,5}>>/.freeze
+
       def initialize(doc, fetcher, errors = {})
         @doc = doc
         @fetcher = fetcher
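
A quick illustration of what CONTROL_PLACEHOLDER_RE does and does not match (sample strings invented):

    re = /<<[A-Z]{2,5}>>/
    "Scope.<<ETX>> This standard ...".gsub(re, "") # => "Scope. This standard ..."
    "<<SOH>><<EOT>>".gsub(re, "").strip.empty?     # => true; such an abstract is skipped
    "a << b and <<x>> too" =~ re                   # => nil; shifts and short tokens survive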

@@ -164,7 +170,10 @@
         result = @doc.volume.article.articleinfo.abstract.each_with_object([]) do |abs, acc|
           next unless abs.abstract_type == "Standard"

-
+          content = abs.value.gsub(CONTROL_PLACEHOLDER_RE, "").strip
+          next if content.empty?
+
+          acc << Bib::Abstract.new(content: content, language: "en", script: "Latn")
         end
         @errors[:abstract] &&= result.empty?
         result

data/lib/relaton/ieee/version.rb CHANGED

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: relaton-ieee
 version: !ruby/object:Gem::Version
-  version: 2.1.1
+  version: 2.1.2
 platform: ruby
 authors:
 - Ribose Inc.
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2026-05-
+date: 2026-05-09 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: faraday