oddb2xml 3.0.18 → 3.0.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/History.txt +6 -0
- data/lib/oddb2xml/cli.rb +5 -0
- data/lib/oddb2xml/downloader.rb +15 -3
- data/lib/oddb2xml/extractor.rb +8 -0
- data/lib/oddb2xml/options.rb +1 -0
- data/lib/oddb2xml/proxy_check.rb +51 -11
- data/lib/oddb2xml/util.rb +19 -0
- data/lib/oddb2xml/version.rb +1 -1
- data/spec/fixtures/vcr_cassettes/oddb2xml.json +2092 -2092
- data/spec/options_spec.rb +4 -0
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: fe41f3e5cfb0e100f111a3071f32ae8dd4b4103d3f5726bf4eab061f80389323
|
|
4
|
+
data.tar.gz: 785dee3549e33031069aceac4f9be31cc844de603d3b183d94b55f976b0c482c
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b385af5fc0643374e878da0cd6e1b262e4d7521c5ac5dd3a3670971f8f130ca3492b4aa296410c5e9e6627021bcbb86b508dbd3e80bf1fdd044c61851df69664
|
|
7
|
+
data.tar.gz: 40a8baa557cdb31905f8711702bbe52f8d968b67488ae88f293233ea3bd1db689c9ea75d27b4b17019cf2369fb593eed3268c3867277daf5cb73c763b6a74791
|
data/Gemfile.lock
CHANGED
data/History.txt
CHANGED
|
@@ -1,3 +1,9 @@
|
|
|
1
|
+
=== 3.0.20 / 09.06.2026
|
|
2
|
+
* Bugfix/robustness: handle truncated or failed Swissmedic .xlsx downloads instead of crashing later in the parser (issue #121). A download through a scanning/allow-list proxy can return an incomplete .xlsx -- one with a valid ZIP/Excel header (so `file` still reports "Microsoft Excel 2007+") but no end-of-central-directory (EOCD) record -- which made RubyXL die deep in rubyzip with the cryptic "Zip end of central directory signature not found". The new Oddb2xml.valid_zip? checks that an archive looks complete (PK header at the start + EOCD signature in the tail). SwissmedicDownloader now validates the downloaded file and automatically re-fetches it (up to the normal retry count) when it is empty or truncated, and no longer reuses a broken cached file when the --skip-download shortcut would otherwise apply. As a last resort, SwissmedicExtractor raises a clear, actionable error (pointing at connectivity / "oddb2xml --proxy-check") instead of the rubyzip backtrace.
|
|
3
|
+
|
|
4
|
+
=== 3.0.19 / 09.06.2026
|
|
5
|
+
* New option --proxy-check: probe connectivity/proxy reachability for every host oddb2xml could need, print a full OK/BLOCKED/UNREACHABLE report (honouring http(s)_proxy) and exit without downloading or building. Exits 0 if all hosts are reachable, 1 otherwise — handy for cron/deploy preflight on allow-list proxies (e.g. "oddb2xml --proxy-check"). Reuses the 3.0.18 proxy checker; checks run concurrently.
|
|
6
|
+
|
|
1
7
|
=== 3.0.18 / 09.06.2026
|
|
2
8
|
* Bugfix: stop the FHIR downloader from crashing with "Errno::ENOENT @ rb_file_s_size ... foph-sl-export-latest-de.ndjson" when run with --skip-download (issue #121). FhirDownloader#skip_download? returned true on the bare --skip-download flag and then called File.size on a file that was never downloaded. Each oddb2xml run uses its own ./downloads dir, so deploy scripts that download once and then re-run with --skip-download in a fresh dir hit this every time. skip_download? now requires the target NDJSON to actually exist on disk before honouring the flag; a missing file falls through to a normal download instead of crashing.
|
|
3
9
|
* New: proxy / connectivity preflight check (issue #121). At the very start of a run, oddb2xml now probes the outbound hosts it needs (honouring the http(s)_proxy environment) and prints a loud warning if any host is blocked by the proxy (HTTP 407 on an allow-list proxy such as Aspectra's Skyhigh gateway) or otherwise unreachable — surfacing the cause up front instead of a later empty-output/Errno symptom. The probed host set is option-aware (e.g. id.gs1.ch only with --firstbase, epl.bag.admin.ch only with --fhir). It only warns and never aborts the run; downloads still proceed and fail individually as before. Checks run concurrently (~6s worst case) and are skipped during tests; set ODDB2XML_SKIP_PROXY_CHECK=1 to silence it.
|
data/lib/oddb2xml/cli.rb
CHANGED
|
@@ -39,6 +39,11 @@ module Oddb2xml
|
|
|
39
39
|
def run
|
|
40
40
|
threads = []
|
|
41
41
|
start_time = Time.now
|
|
42
|
+
if @options[:proxy_check]
|
|
43
|
+
ok = ProxyCheck.report(@options)
|
|
44
|
+
exit(ok ? 0 : 1) unless defined?(RSpec)
|
|
45
|
+
return ok
|
|
46
|
+
end
|
|
42
47
|
ProxyCheck.run(@options)
|
|
43
48
|
files2rm = Dir.glob(File.join(DOWNLOADS, "*"))
|
|
44
49
|
FileUtils.rm_f(files2rm, verbose: true) if (files2rm.size > 0) && !Oddb2xml.skip_download?
|
data/lib/oddb2xml/downloader.rb
CHANGED
|
@@ -279,13 +279,21 @@ module Oddb2xml
|
|
|
279
279
|
def download
|
|
280
280
|
@file2save = File.join(DOWNLOADS, "swissmedic_#{@type}.xlsx")
|
|
281
281
|
report_download(@url, @file2save)
|
|
282
|
-
if @options[:calc] && @options[:skip_download] && File.exist?(@file2save) && ((Time.now - File.ctime(@file2save)).to_i < 24 * 60 * 60)
|
|
282
|
+
if @options[:calc] && @options[:skip_download] && File.exist?(@file2save) && ((Time.now - File.ctime(@file2save)).to_i < 24 * 60 * 60) && Oddb2xml.valid_zip?(@file2save)
|
|
283
283
|
Oddb2xml.log "SwissmedicDownloader #{__LINE__}: Skip downloading #{@file2save} #{File.size(@file2save)} bytes"
|
|
284
284
|
return File.expand_path(@file2save)
|
|
285
285
|
end
|
|
286
286
|
begin
|
|
287
287
|
@url = @direct_url_link
|
|
288
288
|
download_as(@file2save, "w+")
|
|
289
|
+
# The Swissmedic file is an .xlsx (a ZIP). Downloads through scanning
|
|
290
|
+
# proxies are sometimes truncated (valid header, missing EOCD), which
|
|
291
|
+
# would later crash RubyXL with a cryptic rubyzip error. Verify the
|
|
292
|
+
# archive is complete and just fetch it again if not. (issue #121)
|
|
293
|
+
unless Oddb2xml.valid_zip?(@file2save)
|
|
294
|
+
raise Oddb2xml::IncompleteDownloadError,
|
|
295
|
+
"Swissmedic #{@type} xlsx is empty or truncated (#{File.size(@file2save)} bytes)"
|
|
296
|
+
end
|
|
289
297
|
if @options[:artikelstamm]
|
|
290
298
|
# ssconvert is in the package gnumeric (Debian)
|
|
291
299
|
cmd = "ssconvert '#{@file2save}' '#{File.join(DOWNLOADS, File.basename(@file2save).sub(/\.xls.*/, ".csv"))}' 2> /dev/null"
|
|
@@ -293,8 +301,12 @@ module Oddb2xml
|
|
|
293
301
|
system(cmd)
|
|
294
302
|
end
|
|
295
303
|
return File.expand_path(@file2save)
|
|
296
|
-
rescue Timeout::Error, Errno::ETIMEDOUT
|
|
297
|
-
retrievable?
|
|
304
|
+
rescue Timeout::Error, Errno::ETIMEDOUT, Oddb2xml::IncompleteDownloadError => error
|
|
305
|
+
if retrievable?
|
|
306
|
+
Oddb2xml.log("Retrying Swissmedic #{@type} download: #{error.message}")
|
|
307
|
+
retry
|
|
308
|
+
end
|
|
309
|
+
raise
|
|
298
310
|
ensure
|
|
299
311
|
Oddb2xml.download_finished(@file2save, false)
|
|
300
312
|
end
|
data/lib/oddb2xml/extractor.rb
CHANGED
|
@@ -260,6 +260,14 @@ module Oddb2xml
|
|
|
260
260
|
@type = type
|
|
261
261
|
Oddb2xml.log("SwissmedicExtractor #{@filename} #{File.size(@filename)} bytes")
|
|
262
262
|
return unless File.exist?(@filename)
|
|
263
|
+
unless Oddb2xml.valid_zip?(@filename)
|
|
264
|
+
raise "SwissmedicExtractor: '#{@filename}' is not a usable .xlsx " \
|
|
265
|
+
"(#{File.size(@filename)} bytes). The Swissmedic '#{@type}' download failed or was " \
|
|
266
|
+
"truncated -- the file is empty, an HTML error/proxy page, or an incomplete ZIP " \
|
|
267
|
+
"(valid header but missing end-of-central-directory). Check connectivity to " \
|
|
268
|
+
"www.swissmedic.ch (run 'oddb2xml --proxy-check'); a too-small file usually means " \
|
|
269
|
+
"the proxy cut off a large download."
|
|
270
|
+
end
|
|
263
271
|
@sheet = RubyXL::Parser.parse(File.expand_path(@filename)).worksheets[0]
|
|
264
272
|
end
|
|
265
273
|
|
data/lib/oddb2xml/options.rb
CHANGED
|
@@ -46,6 +46,7 @@ module Oddb2xml
|
|
|
46
46
|
opt :use_ra11zip, "Use the ra11.zip (a zipped transfer.dat from Galexis)",
|
|
47
47
|
default: File.exist?("ra11.zip") ? "ra11.zip" : nil, type: :string
|
|
48
48
|
opt :firstbase, "Build all NONPHARMA articles on firstbase (GS1 Switzerland CSV from id.gs1.ch)", short: "b", default: false
|
|
49
|
+
opt :proxy_check, "Only probe connectivity/proxy reachability for every required host, print a report and exit (no download/build). Honours http(s)_proxy. Exits 0 if all reachable, 1 otherwise.", short: :none, default: false
|
|
49
50
|
end
|
|
50
51
|
|
|
51
52
|
@opts[:percent] = @opts[:increment]
|
data/lib/oddb2xml/proxy_check.rb
CHANGED
|
@@ -43,6 +43,46 @@ module Oddb2xml
|
|
|
43
43
|
hosts
|
|
44
44
|
end
|
|
45
45
|
|
|
46
|
+
# Full union of every host any run could need, regardless of options.
|
|
47
|
+
# Used by --proxy-check so the report covers everything in one go.
|
|
48
|
+
def all_hosts
|
|
49
|
+
BASE_HOSTS.merge(
|
|
50
|
+
"epl.bag.admin.ch" => "BAG FHIR data (--fhir)",
|
|
51
|
+
"id.gs1.ch" => "GS1 NONPHARMA (--firstbase / -b)",
|
|
52
|
+
"www.spezialitaetenliste.ch" => "BAG Spezialitätenliste",
|
|
53
|
+
"www.medregbm.admin.ch" => "Medizinalberuferegister (-x address)"
|
|
54
|
+
)
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
# Probe every host and print a full OK/BLOCKED/UNREACHABLE table.
|
|
58
|
+
# Returns true when all hosts are reachable. Used by `oddb2xml --proxy-check`.
|
|
59
|
+
def report(_options = {})
|
|
60
|
+
proxy = proxy_uri
|
|
61
|
+
results = all_hosts.map do |host, desc|
|
|
62
|
+
Thread.new { [host, desc, check_host(host, proxy)] }
|
|
63
|
+
end.map(&:value).sort_by { |(host, _desc, _status)| host }
|
|
64
|
+
|
|
65
|
+
header = "oddb2xml connectivity check"
|
|
66
|
+
header += proxy ? " (via proxy #{proxy.host}:#{proxy.port})" : " (no proxy configured)"
|
|
67
|
+
puts header
|
|
68
|
+
results.each do |(host, desc, status)|
|
|
69
|
+
tag = case status
|
|
70
|
+
when :ok then "OK "
|
|
71
|
+
when :blocked then "BLOCKED" # proxy returned 407
|
|
72
|
+
else "UNREACH"
|
|
73
|
+
end
|
|
74
|
+
puts format(" [%s] %-28s %s", tag, host, desc)
|
|
75
|
+
end
|
|
76
|
+
unreachable = results.reject { |(_host, _desc, status)| status == :ok }
|
|
77
|
+
if unreachable.empty?
|
|
78
|
+
puts "All #{results.size} hosts reachable."
|
|
79
|
+
true
|
|
80
|
+
else
|
|
81
|
+
puts "#{unreachable.size} of #{results.size} host(s) NOT reachable -- downloads using them will fail."
|
|
82
|
+
false
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
|
|
46
86
|
# Returns :ok, :blocked (proxy 407) or :unreachable for a single host.
|
|
47
87
|
def check_host(host, proxy)
|
|
48
88
|
http =
|
|
@@ -85,24 +125,24 @@ module Oddb2xml
|
|
|
85
125
|
|
|
86
126
|
def warn_about(problems, proxy)
|
|
87
127
|
line = "=" * 72
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
128
|
+
warn line
|
|
129
|
+
warn " oddb2xml CONNECTIVITY WARNING"
|
|
130
|
+
warn " The following hosts could not be reached -- the corresponding"
|
|
131
|
+
warn " downloads will FAIL or produce incomplete data:"
|
|
92
132
|
problems.each do |(host, desc, status)|
|
|
93
133
|
tag = (status == :blocked) ? "BLOCKED by proxy (407)" : "UNREACHABLE "
|
|
94
|
-
|
|
134
|
+
warn format(" [%s] %-26s %s", tag, host, desc)
|
|
95
135
|
end
|
|
96
136
|
if proxy
|
|
97
|
-
|
|
98
|
-
|
|
137
|
+
warn ""
|
|
138
|
+
warn " Proxy in use: #{proxy.host}:#{proxy.port}"
|
|
99
139
|
if problems.any? { |(_h, _d, s)| s == :blocked }
|
|
100
|
-
|
|
101
|
-
|
|
140
|
+
warn " This looks like an allow-list proxy. Ask your admin to allow the"
|
|
141
|
+
warn " hosts above (HTTPS/443), or set credentials in http(s)_proxy."
|
|
102
142
|
end
|
|
103
143
|
end
|
|
104
|
-
|
|
105
|
-
|
|
144
|
+
warn " (Set ODDB2XML_SKIP_PROXY_CHECK=1 to silence this check.)"
|
|
145
|
+
warn line
|
|
106
146
|
end
|
|
107
147
|
end
|
|
108
148
|
end
|
data/lib/oddb2xml/util.rb
CHANGED
|
@@ -3,10 +3,29 @@ require "htmlentities"
|
|
|
3
3
|
|
|
4
4
|
module Oddb2xml
|
|
5
5
|
FAKE_GTIN_START = "999999"
|
|
6
|
+
|
|
7
|
+
# Raised when a downloaded archive (zip/xlsx) is empty or truncated, so the
|
|
8
|
+
# caller can retry the download instead of crashing later in the parser.
|
|
9
|
+
class IncompleteDownloadError < StandardError; end
|
|
10
|
+
|
|
6
11
|
def self.gen_prodno(iksnr, seqnr)
|
|
7
12
|
sprintf("%05d", iksnr) + sprintf("%02d", seqnr)
|
|
8
13
|
end
|
|
9
14
|
|
|
15
|
+
# True only for a *complete* ZIP/xlsx: a valid local-file header (PK) plus the
|
|
16
|
+
# End Of Central Directory record (PK\x05\x06) near the end. A truncated
|
|
17
|
+
# download keeps the header (so `file` still says "Microsoft Excel 2007+") but
|
|
18
|
+
# loses the EOCD, which makes rubyzip fail with the cryptic "end of central
|
|
19
|
+
# directory signature not found". See issue #121.
|
|
20
|
+
def self.valid_zip?(file)
|
|
21
|
+
return false unless file && File.exist?(file)
|
|
22
|
+
size = File.size(file)
|
|
23
|
+
return false unless size > 100
|
|
24
|
+
return false unless File.binread(file, 2) == "PK"
|
|
25
|
+
offset = [size - 66_000, 0].max
|
|
26
|
+
File.binread(file, size - offset, offset).include?("PK\x05\x06".b)
|
|
27
|
+
end
|
|
28
|
+
|
|
10
29
|
def self.uri_open(url, max_retries: 3)
|
|
11
30
|
retries = 0
|
|
12
31
|
begin
|
data/lib/oddb2xml/version.rb
CHANGED