relaton-ieee 2.1.1 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 64d85b643d5c8107a3b2c4aa5c66d303056a0ff29d8b82dca5c3b2ca4506a914
- data.tar.gz: bb114fc3be23077ca00e43887f76d078374f8c20d0266750b51e598e092adc01
+ metadata.gz: 5a2e8b9dc4134eba7c8047ca10a447fa89a6cad9df60006130e45fff3f8a917c
+ data.tar.gz: 2a3b2d228652b90e6db5bc9bc77b36d6a07bb51ddf38b211c2a8ecb17b85de97
  SHA512:
- metadata.gz: 4c43e3ad8cfd2ecf686ecebdd0059165d34e9a4bc162623cf0d7520fc5fb4f4cc63e05e57bc9b70e4c55b024e25e0785f301bb4c30d07c85c0e36efdbca94be9
- data.tar.gz: c0fc2716834fb2d153c3c39575ea24535f68f7c237604d195ee362175880f7f99d85d316df30b59b68685447f7de7877dce105908c98447de941ccdbed6412d7
+ metadata.gz: cfca04d99a0f3c8eb8ca20168cbb9d3c36ca6806db8537c604f86bd9d21419c44d67250c825ea6a1e4001a1969931c8db69bb320829b3e074e7f3eb259df3861
+ data.tar.gz: 2575d47ae68327ba17c780f986a0a0a50511b73bbb459a23a95aada71771f845181da15f63bd9d01ab6ce133c275641afd81628c3e1a84f67e768b710499de4b
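The checksum block can be verified locally against a downloaded copy of the gem. A minimal sketch, assuming the .gem archive has already been unpacked (for example with tar) so that metadata.gz, data.tar.gz and checksums.yaml.gz sit in the current directory:

require "digest"
require "yaml"
require "zlib"

# Sketch: compare the SHA256 entries from checksums.yaml.gz with the digests
# of the unpacked gem payload files. The paths are assumptions about where
# the archive was extracted, not part of relaton-ieee itself.
checksums = YAML.safe_load(Zlib::GzipReader.open("checksums.yaml.gz", &:read))
%w[metadata.gz data.tar.gz].each do |name|
  actual = Digest::SHA256.file(name).hexdigest
  expected = checksums.dig("SHA256", name)
  puts "#{name}: #{actual == expected ? 'OK' : 'MISMATCH'}"
end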
data/Rakefile CHANGED
@@ -11,7 +11,7 @@ namespace :spec do
  require "net/http"
  require "uri"

- url = "https://raw.githubusercontent.com/relaton/relaton-data-ieee/data-v2/index-v1.zip"
+ url = "https://raw.githubusercontent.com/relaton/relaton-data-ieee/v2/index-v1.zip"
  dest = File.join(__dir__, "spec", "fixtures", "index-v1.zip")

  puts "Downloading #{url} ..."
@@ -1,7 +1,7 @@
  module Relaton
  module Ieee
  class Bibliography
- GH_URL = "https://raw.githubusercontent.com/relaton/relaton-data-ieee/refs/heads/data-v2/".freeze
+ GH_URL = "https://raw.githubusercontent.com/relaton/relaton-data-ieee/refs/heads/v2/".freeze

  class << self
  #
@@ -1,3 +1,4 @@
+ require "etc"
  require "zip"
  require_relative "../ieee"
  require_relative "converter/bibxml"
@@ -26,17 +27,10 @@ module Relaton
  Util.error msg
  end

- def fetch(_source = nil) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
- Dir["ieee-rawbib/**/*.{xml,zip}"].reject { |f| f["Deleted_"] }.each do |f|
- xml = case File.extname(f)
- when ".zip" then read_zip f
- when ".xml" then File.read f, encoding: "UTF-8"
- end
- fetch_doc xml, f
- rescue StandardError => e
- Util.error "File: #{f}\n#{e.message}\n#{e.backtrace}"
- end
- # File.write "normtitles.txt", @normtitles.join("\n")
+ def fetch(_source = nil)
+ files = Dir["ieee-rawbib/**/*.{xml,zip}"].reject { |f| f["Deleted_"] }
+ files = prefilter_winners(files) unless ENV["IEEE_FETCH_PREFILTER"] == "0"
+ process_files(files)
  update_relations
  report_errors
  end
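The rewritten fetch entry point is tuned entirely through environment variables that appear in the new code (IEEE_FETCH_PREFILTER, IEEE_FETCH_PROCESSES, IEEE_FETCH_BATCH, IEEE_FETCH_PREFILTER_MIN, IEEE_PREFILTER_BATCH). A hedged usage sketch; the fetcher object below is a placeholder, since its construction is outside this diff:

# Sketch: tuning the parallel fetch via the environment variables read by
# the new pipeline. `fetcher` stands in for the gem's data-fetcher object.
ENV["IEEE_FETCH_PROCESSES"] = "8"        # worker processes (default: Etc.nprocessors)
ENV["IEEE_FETCH_BATCH"] = "2000"         # files per short-lived worker (default: 1000)
ENV["IEEE_FETCH_PREFILTER"] = "1"        # any value other than "0" keeps the prefilter on
ENV["IEEE_FETCH_PREFILTER_MIN"] = "200"  # below this file count the prefilter is skipped
ENV["IEEE_PREFILTER_BATCH"] = "5000"     # files per prefilter worker (default: 5000)

fetcher.fetch # glob ieee-rawbib/**, prefilter, parse in forked batches, resolve relations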
@@ -46,8 +40,27 @@ module Relaton
  @backrefs ||= {}
  end

+ # @return [Hash] list of docnumber => parsed bib (cache for update_relations)
+ def docs
+ @docs ||= {}
+ end
+
+ # @return [Hash] docnumber => max global glob-index whose write was
+ # accepted by commit_doc. Populated only when running with parallel
+ # workers (writes are staged to per-glob-index suffixed paths and
+ # reconciled into the final filename after the parsing phase).
+ def saved_writes
+ @saved_writes ||= {}
+ end
+
+ # Mutex guarding worker-thread mutations of shared state during parse.
+ def mutex
+ @mutex ||= Mutex.new
+ end
+
  #
- # Save unresolved relation reference
+ # Save unresolved relation reference. Called from worker threads via
+ # IdamsParser#parse_relation, so mutates crossrefs under a mutex.
  #
  # @param [String] docnumber of main document
  # @param [Nokogiri::XML::Element] amsid relation data
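The saved_writes bookkeeping documented above feeds a rename-based reconciliation pass after parsing. A minimal standalone sketch of that idea, with invented names and a temporary directory standing in for the gem's output directory:

require "tmpdir"

saved_writes = {} # docnumber => highest staged index written so far

def stage_write(dir, saved_writes, docnumber, idx, content)
  # Write to a per-index staging path instead of the final filename.
  File.write(File.join(dir, "#{docnumber}.yaml.#{idx}"), content)
  prev = saved_writes[docnumber]
  saved_writes[docnumber] = idx if prev.nil? || idx > prev
end

def reconcile(dir, saved_writes)
  # Promote the highest-index staged copy per docnumber, drop the rest.
  saved_writes.each do |docnumber, max_idx|
    final = File.join(dir, "#{docnumber}.yaml")
    winner = "#{final}.#{max_idx}"
    File.rename(winner, final) if File.exist?(winner)
  end
  Dir.glob(File.join(dir, "*.yaml.*")).each { |f| File.unlink(f) }
end

dir = Dir.mktmpdir
stage_write(dir, saved_writes, "IEEE123", 4, "older update")
stage_write(dir, saved_writes, "IEEE123", 9, "latest update")
reconcile(dir, saved_writes)
puts File.read(File.join(dir, "IEEE123.yaml")) # => latest update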
@@ -56,7 +69,7 @@ module Relaton
  return if RELATION_TYPES[amsid.type] == false

  ref = { amsid: amsid.date_string, type: amsid.type }
- crossrefs[docnumber] << ref
+ mutex.synchronize { crossrefs[docnumber] << ref }
  end

  #
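For readers unfamiliar with the pattern, the change above is the usual append-to-a-shared-hash-under-a-mutex idiom. A tiny standalone illustration (the auto-vivifying Hash default is an assumption about how crossrefs is initialised elsewhere in the class):

# Sketch: concurrent appends to a Hash of Arrays guarded by a Mutex.
crossrefs = Hash.new { |h, k| h[k] = [] }
mutex = Mutex.new

threads = 4.times.map do |i|
  Thread.new do
    10.times { |j| mutex.synchronize { crossrefs["doc#{i % 2}"] << j } }
  end
end
threads.each(&:join)
puts crossrefs["doc0"].size # => 20, no lost updates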
@@ -109,47 +122,463 @@ module Relaton
  end

  #
- # Parse document and save it
+ # Pre-filter the input file list down to the subset that actually
+ # has to be fully parsed.
+ #
+ # The IEEE rawbib dataset has ~50× duplication: every docnumber
+ # appears in `cache/` plus most `updates.YYYYMMDD/` folders. The
+ # original semantic is "latest update wins on disk", so for any
+ # docnumber that has at least one updates-folder file, the cache
+ # file's parse result is just thrown away. Pre-filter avoids
+ # parsing those throwaway files entirely.
+ #
+ # The cheap path here only has to extract three small XML elements
+ # (normtitle, stdnumber, standard_id) per file — done with
+ # regex on the raw XML so we skip lutaml-model's heavy DOM-to-
+ # object construction (which is what dominates fetch time).
+ #
+ # Selection rules:
+ # - For each docnumber with any updates-folder entry: keep only
+ # the highest-glob-idx updates-folder file.
+ # - For docnumbers with cache-folder entries only: keep all
+ # of them (commit_doc's matches-stdnumber dedup handles them).
+ # - Files where the cheap parse couldn't compute a docnumber
+ # are kept as-is — the full parse will surface any real error.
+ #
+ # Disable with IEEE_FETCH_PREFILTER=0.
+ #
+ def prefilter_winners(files)
+ threshold = Integer(ENV["IEEE_FETCH_PREFILTER_MIN"] || 200)
+ return files if files.size < threshold
+
+ procs = Integer(ENV["IEEE_FETCH_PROCESSES"] || Etc.nprocessors)
+ index = procs <= 1 ? prefilter_serial(files) : prefilter_parallel(files, procs)
+ select_prefilter_winners(index, files.size)
+ end
+
+ def prefilter_serial(files)
+ files.each_with_index.map { |f, i| extract_index_entry(i, f) }.compact
+ end
+
+ def prefilter_parallel(files, procs) # rubocop:disable Metrics/MethodLength
+ batch_size = Integer(ENV["IEEE_PREFILTER_BATCH"] || 5000)
+ batches = files.each_slice(batch_size).each_with_index.to_a
+
+ next_batch = 0
+ inflight = {}
+ collected = []
+
+ procs.times do
+ break if next_batch >= batches.size
+
+ inflight.merge!(spawn_prefilter_batch(*batches[next_batch], batch_size))
+ next_batch += 1
+ end
+
+ until inflight.empty?
+ pid = Process.wait
+ collected << inflight.delete(pid)
+
+ if next_batch < batches.size
+ inflight.merge!(spawn_prefilter_batch(*batches[next_batch], batch_size))
+ next_batch += 1
+ end
+ end
+
+ index = []
+ collected.each do |path|
+ next unless path && File.exist?(path) && File.size(path).positive?
+
+ index.concat(Marshal.load(File.binread(path)))
+ File.unlink(path)
+ end
+ index
+ end
+
+ def spawn_prefilter_batch(batch_files, batch_idx, batch_size)
+ require "tmpdir"
+ require "securerandom"
+ state_path = File.join(
+ Dir.tmpdir,
+ "ieee_prefilter_#{Process.pid}_#{batch_idx}_#{SecureRandom.hex(4)}.bin",
+ )
+ base_idx = batch_idx * batch_size
+
+ pid = Process.fork do
+ entries = batch_files.each_with_index.map do |file, i|
+ extract_index_entry(base_idx + i, file)
+ end.compact
+ File.binwrite(state_path, Marshal.dump(entries))
+ exit!(0)
+ end
+ { pid => state_path }
+ end
+
+ #
+ # Cheap-parse one file: read XML, regex-extract three fields,
+ # compute docnumber via the existing RawbibIdParser. Returns
+ # `[glob_idx, file, docnumber_or_nil, in_updates_folder?]`.
+ #
+ def extract_index_entry(idx, file)
+ xml = case File.extname(file)
+ when ".zip" then read_zip(file)
+ when ".xml" then File.read(file, encoding: "UTF-8")
+ end
+ return nil unless xml
+ return nil if cheap_extract_field(xml, "standard_id") == "0"
+
+ normtitle = cheap_extract_field(xml, "normtitle")
+ stdnumber = cheap_extract_field(xml, "stdnumber")
+ docnumber = nil
+ if normtitle && stdnumber
+ pubid = RawbibIdParser.parse(normtitle, stdnumber)
+ docnumber = pubid&.to_id
+ end
+ [idx, file, docnumber, file.include?("/updates.")]
+ rescue StandardError
+ # Cheap parse couldn't handle this file — keep it; full parse will
+ # either succeed or surface the real error.
+ [idx, file, nil, file.include?("/updates.")]
+ end
+
+ def cheap_extract_field(xml, tag)
+ m = xml.match(%r{<#{tag}[^>]*?>(?:<!\[CDATA\[)?(.*?)(?:\]\]>)?</#{tag}>}m)
+ m && m[1].strip
+ end
+
+ def select_prefilter_winners(index, total)
+ unknown = index.select { |e| e[2].nil? }
+ by_doc = index.reject { |e| e[2].nil? }.group_by { |e| e[2] }
+
+ selected = []
+ by_doc.each_value do |entries|
+ updates = entries.select { |e| e[3] }
+ if updates.any?
+ selected << updates.max_by { |e| e[0] }
+ else
+ selected.concat(entries)
+ end
+ end
+
+ kept = (selected + unknown).sort_by { |e| e[0] }.map { |e| e[1] }
+ Util.warn "Prefilter: #{total} input files -> #{kept.size} winners " \
+ "(#{(100.0 * kept.size / total).round(1)}%)"
+ kept
+ end
+
+ #
+ # Parse files across a pool of short-lived forked workers. Each
+ # worker processes one bounded batch (IEEE_FETCH_BATCH files,
+ # default 1000), writes its output YAMLs to disk, marshals its
+ # local backrefs / crossrefs / errors to a tmp file, and exits.
+ # The parent keeps `procs` workers in flight; as each one exits,
+ # its state file is collected and the next batch is spawned.
  #
- # @param [String] xml content
- # @param [String] filename source file
+ # Why short-lived workers, not one long-running shard per core:
+ # Ruby's heap grows monotonically and the VM doesn't return
+ # freed memory to the OS, so a child that parses 50k files ends
+ # up at 1+ GB RSS even with the docs cache disabled. With ten
+ # such children the box swaps and slows to a crawl. Exiting a
+ # child after a batch of a few thousand files lets the OS
+ # reclaim its heap; the next fork starts fresh from the parent's
+ # baseline. Fork is cheap (copy-on-write), so the overhead is
+ # negligible compared to the memory savings.
  #
- def fetch_doc(xml, filename) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
- begin
- doc = ::Ieee::Idams::Publication.from_xml(xml)
+ # Caveats from sharding (same as the previous design):
+ # - Cross-batch duplicates: when the same docnumber appears in
+ # multiple batches, the last-finishing batch's write wins.
+ # Merged backrefs/crossrefs are still complete, so
+ # update_relations resolves cross-refs correctly.
+ # - "Document exists" warnings are per-batch, so cross-batch
+ # duplicates may not log a warning. Logging only.
+ #
+ # @param [Array<String>] files paths to rawbib XML/zip files
+ #
+ def process_files(files) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
+ procs = Integer(ENV["IEEE_FETCH_PROCESSES"] || Etc.nprocessors)
+ procs = 1 if files.empty? || procs < 2 || files.size < procs * 2
+
+ return run_shard(files, 0) if procs <= 1
+
+ batch_size = Integer(ENV["IEEE_FETCH_BATCH"] || 1000)
+ batches = files.each_slice(batch_size).each_with_index.to_a
+
+ state_paths = run_worker_pool(batches, procs)
+ merge_state_files(state_paths)
+ reconcile_staged_outputs
+ end
+
+ #
+ # Promote the highest-glob-index staged write per docnumber to its
+ # final on-disk filename, then delete any leftover staged files.
+ # Restores exact "latest update wins" semantics across batches:
+ # without this pass, a slow batch finishing late could overwrite a
+ # newer update that an earlier-completing batch had already saved.
+ #
+ def reconcile_staged_outputs
+ return if saved_writes.empty?
+
+ saved_writes.each do |docnumber, max_idx|
+ final = output_file(docnumber)
+ winner = "#{final}.#{max_idx}"
+ File.rename(winner, final) if File.exist?(winner)
+ end
+
+ # Stragglers: any remaining staged files (losing duplicates,
+ # or bib filenames that didn't end up in saved_writes due to a
+ # crash) get cleaned up so they don't pollute `data/`.
+ Dir.glob(File.join(@output, "*.#{@ext}.*")).each do |f|
+ File.unlink(f)
  rescue StandardError
- Util.warn "Empty file: `#{filename}`"
- return
+ # ignore best-effort cleanup
+ end
+ end
+
+ #
+ # Merge all batch state files into the parent's hashes. Runs once,
+ # after the worker pool has drained, so the parent's heap only
+ # has to hold the cumulative merged state (small) plus one batch's
+ # transient marshaled payload at a time.
+ #
+ def merge_state_files(state_paths)
+ state_paths.each_with_index do |path, i|
+ merge_batch_state(path)
+ # Periodic GC.start keeps the transient marshal allocations
+ # from piling up over hundreds of merges.
+ GC.start if (i % 50).zero?
+ end
+ end
+
+ #
+ # Maintain `procs` concurrent short-lived workers. Each Process.wait
+ # call blocks until any worker exits; we collect its state-file
+ # path and spawn the next batch (if any).
+ #
+ # Critically, we do NOT merge state into the parent's hashes here.
+ # Loading and merging dozens of MB of marshaled hashes per batch
+ # bloated the parent's heap into the multi-GB range, and every
+ # subsequent fork inherited that bloat via copy-on-write — driving
+ # the box into swap. By deferring all merging to after the parsing
+ # phase, the parent stays at ~baseline RSS while children are alive,
+ # so each fork's COW baseline is small.
+ #
+ # @return [Array<String>] state-file paths in completion order
+ #
+ def run_worker_pool(batches, procs) # rubocop:disable Metrics/MethodLength
+ next_batch = 0
+ inflight = {} # pid => state_path
+ collected = []
+
+ procs.times do
+ break if next_batch >= batches.size
+
+ inflight.merge!(spawn_batch(*batches[next_batch]))
+ next_batch += 1
+ end
+
+ until inflight.empty?
+ pid = Process.wait
+ collected << inflight.delete(pid)
+
+ if next_batch < batches.size
+ inflight.merge!(spawn_batch(*batches[next_batch]))
+ next_batch += 1
+ end
+ end
+
+ collected
+ end
+
+ #
+ # Fork one short-lived worker for a single batch. Returns a
+ # `{pid => state_path}` Hash. The worker writes its marshaled
+ # local state to `state_path` then exits; the tmp file is read
+ # and unlinked by the parent in `merge_batch_state`.
+ #
+ def spawn_batch(batch_files, batch_idx) # rubocop:disable Metrics/MethodLength
+ require "tmpdir"
+ require "securerandom"
+ state_path = File.join(
+ Dir.tmpdir,
+ "ieee_fetch_#{Process.pid}_#{batch_idx}_#{SecureRandom.hex(4)}.bin",
+ )
+ base_idx = batch_idx * Integer(ENV["IEEE_FETCH_BATCH"] || 1000)
+
+ pid = Process.fork do
+ batch_files.each_with_index do |file, i|
+ glob_idx = base_idx + i
+ result = parse_entry(glob_idx, file)
+ next unless result
+
+ _, _, doc, bib, local_errors = result
+ merge_errors(local_errors)
+ commit_doc(doc, bib, file, glob_idx)
+ end
+ File.binwrite(state_path, Marshal.dump(
+ backrefs: backrefs,
+ crossrefs: {}.merge(crossrefs),
+ errors: {}.merge(@errors),
+ saved_writes: saved_writes,
+ ))
+ exit!(0)
+ end
+
+ { pid => state_path }
+ end
+
+ #
+ # Read one batch's marshaled state, merge into parent state,
+ # remove the tmp file. Tolerates a missing/empty file (worker
+ # crash) by treating it as an empty merge.
+ #
+ def merge_batch_state(state_path)
+ if state_path && File.exist?(state_path) && File.size(state_path).positive?
+ payload = Marshal.load(File.binread(state_path))
+ merge_shard_state(payload)
+ end
+ ensure
+ File.unlink(state_path) if state_path && File.exist?(state_path)
+ end
+
+ #
+ # Merge one child's per-shard state into the parent's. backrefs uses
+ # ||= so the lowest-shard-id value wins for any amsid/docnumber pair
+ # that happens to appear in multiple shards (in practice they agree).
+ # `saved_writes` tracks the highest glob-index at which any worker
+ # saved a doc, so the parent can later rename the winning staged
+ # file to its final name.
+ #
+ def merge_shard_state(state)
+ state[:backrefs].each { |amsid, content| backrefs[amsid] ||= content }
+ state[:crossrefs].each { |dnum, refs| crossrefs[dnum].concat(refs) }
+ state[:errors].each { |k, v| @errors[k] &&= v }
+ (state[:saved_writes] || {}).each do |dnum, idx|
+ prev = saved_writes[dnum]
+ saved_writes[dnum] = idx if prev.nil? || idx > prev
  end
- return if doc.publicationinfo&.standard_id == "0"
+ end

- bib = IdamsParser.new(doc, self, @errors).parse
+ #
+ # Process one shard sequentially. Either runs in a forked child or,
+ # when procs == 1, in the parent.
+ #
+ # `shard` is an array of [original_idx, file] tuples (or, when
+ # called from the procs==1 fallback, just the array of file paths
+ # — we normalize below).
+ #
+ def run_shard(shard, _shard_idx)
+ shard.each_with_index do |entry, i|
+ idx, file = entry.is_a?(Array) ? entry : [i, entry]
+ result = parse_entry(idx, file)
+ next unless result
+
+ _, _, doc, bib, local_errors = result
+ merge_errors(local_errors)
+ commit_doc(doc, bib, file)
+ end
+ end
+
+ #
+ # Per-file parse step: read file, parse XML, build bib.
+ # Returns nil for files we should skip; otherwise a tuple that the
+ # calling worker hands to merge_errors and commit_doc.
+ #
+ # @param [Integer] idx original glob index (preserves dedup order)
+ # @param [String] file path to rawbib file
+ #
+ # @return [Array, nil] [idx, file, doc, bib, local_errors] or nil
+ #
+ def parse_entry(idx, file)
+ xml = case File.extname(file)
+ when ".zip" then read_zip file
+ when ".xml" then File.read file, encoding: "UTF-8"
+ end
+ doc = begin
+ ::Ieee::Idams::Publication.from_xml(xml)
+ rescue StandardError
+ Util.warn "Empty file: `#{file}`"
+ return nil
+ end
+ return nil if doc.publicationinfo&.standard_id == "0"
+
+ local_errors = Hash.new(true)
+ bib = IdamsParser.new(doc, self, local_errors).parse
  if bib.docnumber.nil?
- Util.warn "PubID parse error. Normtitle: `#{doc.normtitle}`, file: `#{filename}`"
- return
+ Util.warn "PubID parse error. Normtitle: `#{doc.normtitle}`, file: `#{file}`"
+ return nil
  end
+ [idx, file, doc, bib, local_errors]
+ rescue StandardError => e
+ Util.error "File: #{file}\n#{e.message}\n#{e.backtrace}"
+ nil
+ end
+
+ #
+ # Merge a worker's local errors hash into the shared @errors hash,
+ # preserving the existing AND semantics (`@errors[k] &&= v`).
+ #
+ def merge_errors(local_errors)
+ local_errors.each { |k, v| @errors[k] &&= v }
+ end
+
+ #
+ # Dedup against backrefs and save. This runs once per parsed file —
+ # in the parent for the procs==1 fallback, or in each forked child
+ # for its shard. Same logic the old fetch_doc tail had, plus
+ # optional staged-output bookkeeping when `glob_idx` is provided.
+ #
+ # When `glob_idx` is given (parallel mode), save_doc writes to a
+ # per-glob-index suffixed path; the parent reconciles after the
+ # parsing phase and renames the highest-glob-index winner per
+ # docnumber to the final filename. This preserves the original
+ # "latest update wins on disk" semantic across batch boundaries
+ # — without it, a slow batch finishing late could overwrite a
+ # newer update written by an earlier-completing batch.
+ #
+ def commit_doc(doc, bib, filename, glob_idx = nil)
  amsid = doc.publicationinfo.amsid
  if backrefs.value?(bib.docidentifier[0].content) && /updates\.\d+/ !~ filename
  oamsid = backrefs.key bib.docidentifier[0].content
  Util.warn "Document exists ID: `#{bib.docidentifier[0].content}` AMSID: " \
  "`#{amsid}` source: `#{filename}`. Other AMSID: `#{oamsid}`"
  if bib.docidentifier.find(&:primary).content.include?(doc.publicationinfo.stdnumber)
- save_doc bib # rewrite file if the PubID matches to the stdnumber
+ save_doc(bib, glob_idx) # rewrite file if the PubID matches to the stdnumber
  backrefs[amsid] = bib.docidentifier[0].content
+ track_save(bib.docnumber, glob_idx)
  end
  else
- save_doc bib
+ save_doc(bib, glob_idx)
  backrefs[amsid] = bib.docidentifier[0].content
+ track_save(bib.docnumber, glob_idx)
  end
  end

  #
- # Save document to file
+ # Record that we wrote a staged copy of `docnumber` at this
+ # `glob_idx`. The parent later picks the highest tracked idx
+ # per docnumber as the surviving on-disk version.
+ #
+ def track_save(docnumber, glob_idx)
+ return unless glob_idx
+
+ prev = saved_writes[docnumber]
+ saved_writes[docnumber] = glob_idx if prev.nil? || glob_idx > prev
+ end
+
+ #
+ # Save document to file. When `glob_idx` is provided (parallel
+ # mode), writes to a per-glob-index suffixed staging path so
+ # concurrent workers can't clobber each other's files; the parent
+ # reconciles after parsing. With no glob_idx, writes the final
+ # filename directly (sequential mode and update_relations).
  #
  # @param [RelatonIeee::IeeeBibliographicItem] bib
+ # @param [Integer, nil] glob_idx position in the original file glob
  #
- def save_doc(bib)
- File.write output_file(bib.docnumber), serialize(bib), encoding: "UTF-8"
+ def save_doc(bib, glob_idx = nil)
+ path = output_file(bib.docnumber)
+ path = "#{path}.#{glob_idx}" if glob_idx
+ File.write path, serialize(bib), encoding: "UTF-8"
  end

  def to_yaml(bib) = bib.to_yaml
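The comments in this hunk describe the batching scheme in enough detail to reproduce it generically. A self-contained sketch of the same short-lived fork pool (bounded batches, Process.wait, per-worker Marshal state files, merging deferred until the pool drains); it is illustrative only and shares no code with the gem:

require "etc"
require "tmpdir"
require "securerandom"

items = (1..100).to_a
batches = items.each_slice(10).to_a
procs = [Etc.nprocessors, 4].min

spawn_batch = lambda do |batch|
  path = File.join(Dir.tmpdir, "pool_#{Process.pid}_#{SecureRandom.hex(4)}.bin")
  pid = Process.fork do
    result = batch.sum { |n| n * n }   # stand-in for per-file parsing
    File.binwrite(path, Marshal.dump(result))
    exit!(0)                           # skip at_exit hooks, free the child's heap
  end
  [pid, path]
end

next_batch = 0
inflight = {}                          # pid => state file path
collected = []

# Keep `procs` short-lived workers in flight.
[procs, batches.size].min.times do
  pid, path = spawn_batch.call(batches[next_batch])
  inflight[pid] = path
  next_batch += 1
end

until inflight.empty?
  pid = Process.wait                   # blocks until any child exits
  collected << inflight.delete(pid)
  if next_batch < batches.size
    new_pid, path = spawn_batch.call(batches[next_batch])
    inflight[new_pid] = path
    next_batch += 1
  end
end

# Deferred merge: the parent stays small while children are alive.
total = collected.sum { |p| Marshal.load(File.binread(p)).tap { File.unlink(p) } }
puts total # => 338350, the sum of squares 1..100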
@@ -157,23 +586,27 @@ module Relaton
  def to_bibxml(bib) = bib.to_rfcxml

  #
- # Update unresoverd relations
+ # Resolve cross-references collected during parse. Uses the in-memory
+ # `docs` cache so we don't re-read+re-deserialize files from disk, and
+ # writes each mutated bib once instead of once per relation.
  #
  def update_relations # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  crossrefs.each do |dnum, rfs|
  bib = nil
+ mutated = false
  rfs.each do |rf|
  if backrefs[rf[:amsid]]
  rel = create_relation(rf[:type], backrefs[rf[:amsid]])
  if rel
- bib ||= read_bib(dnum)
+ bib ||= docs[dnum] || read_bib(dnum)
  bib.relation << rel
- save_doc bib
+ mutated = true
  end
  else
  Util.warn "Unresolved relation: '#{rf[:amsid]}' type: '#{rf[:type]}' for '#{dnum}'"
  end
  end
+ save_doc(bib) if mutated
  end
  end

@@ -11,6 +11,12 @@ module Relaton
  relation source keyword ext
  ].freeze

+ # Upstream IDAMS abstracts sometimes carry escaped ASCII control
+ # characters as printable tokens like `<<ETX>>`. They are meaningless
+ # in output, and `<<…>>` blows up XML serialization downstream
+ # (libxml2 reads `<<` as the start of a tag). Strip the whole family.
+ CONTROL_PLACEHOLDER_RE = /<<[A-Z]{2,5}>>/.freeze
+
  def initialize(doc, fetcher, errors = {})
  @doc = doc
  @fetcher = fetcher
@@ -164,7 +170,10 @@ module Relaton
  result = @doc.volume.article.articleinfo.abstract.each_with_object([]) do |abs, acc|
  next unless abs.abstract_type == "Standard"

- acc << Bib::Abstract.new(content: abs.value, language: "en", script: "Latn")
+ content = abs.value.gsub(CONTROL_PLACEHOLDER_RE, "").strip
+ next if content.empty?
+
+ acc << Bib::Abstract.new(content: content, language: "en", script: "Latn")
  end
  @errors[:abstract] &&= result.empty?
  result
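The effect of CONTROL_PLACEHOLDER_RE and the gsub/strip chain added above is easiest to see on a sample value; the abstract text below is invented:

# Sketch: how the new abstract cleanup behaves on a sample string.
# The regex and gsub/strip chain mirror the diff; the sample text is made up.
CONTROL_PLACEHOLDER_RE = /<<[A-Z]{2,5}>>/.freeze

sample = "Scope of the standard.<<ETX>> Applies to local networks.<<EOT>>"
cleaned = sample.gsub(CONTROL_PLACEHOLDER_RE, "").strip
puts cleaned
# => "Scope of the standard. Applies to local networks."

only_noise = "<<SOH>><<STX>>"
puts only_noise.gsub(CONTROL_PLACEHOLDER_RE, "").strip.empty?
# => true (such abstracts are now skipped instead of emitting an empty entry)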
@@ -1,5 +1,5 @@
  module Relaton
  module Ieee
- VERSION = "2.1.1".freeze
+ VERSION = "2.1.2".freeze
  end
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: relaton-ieee
  version: !ruby/object:Gem::Version
- version: 2.1.1
+ version: 2.1.2
  platform: ruby
  authors:
  - Ribose Inc.
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2026-05-05 00:00:00.000000000 Z
+ date: 2026-05-09 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: faraday