oddb2xml 3.0.3 → 3.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/CLAUDE.md +6 -2
- data/Gemfile.lock +90 -69
- data/History.txt +3 -0
- data/README.md +32 -3
- data/lib/oddb2xml/builder.rb +30 -0
- data/lib/oddb2xml/downloader.rb +3 -3
- data/lib/oddb2xml/extractor.rb +16 -20
- data/lib/oddb2xml/options.rb +1 -1
- data/lib/oddb2xml/refdata_cleanup.rb +34 -0
- data/lib/oddb2xml/version.rb +1 -1
- data/lib/oddb2xml.rb +1 -0
- data/spec/refdata_cleanup_spec.rb +151 -0
- metadata +6 -6
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ad986fc08f7abf1826b696378727e91dcb94cc53d5cdcdc2e0d009b755b92410
|
|
4
|
+
data.tar.gz: 7df4f478dc0cb3cfbd2dfcb796b87ad047aadfdbc37bc9eab80a58b9943078b1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 9b7a3266179a31992706cd2d3c999cc73b94cacf293617935475d801c1cc00ed5d23ed91daec53cf2c4f4a234d7057dfe82390980465f9b5e7a19bbff6bcca6c
|
|
7
|
+
data.tar.gz: d223b395fa978c6361c3f6c76ab2111558b029f97e9fad09b2c53d379f24f42f46c9b667da531c7505bcf7c121b5bd21cdd588ec64b530e80c718654c605a27f
|
data/.ruby-version
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
3.
|
|
1
|
+
3.4.5
|
data/CLAUDE.md
CHANGED
|
@@ -37,7 +37,7 @@ The system follows a **download → extract → build → compress** pipeline:
|
|
|
37
37
|
|
|
38
38
|
1. **CLI** (`lib/oddb2xml/cli.rb`) — Entry point. Parses options via Optimist (`options.rb`), orchestrates the pipeline, manages multi-threaded downloads.
|
|
39
39
|
|
|
40
|
-
2. **Downloaders**
|
|
40
|
+
2. **Downloaders** — 11 subclasses of `Downloader`, each fetching from a specific Swiss data source. 10 live in `lib/oddb2xml/downloader.rb`; the FHIR downloader lives in `lib/oddb2xml/fhir_support.rb`. Files cached in `./downloads/`.
|
|
41
41
|
|
|
42
42
|
3. **Extractors** (`lib/oddb2xml/extractor.rb`) — Matching extractor classes that parse downloaded files into Ruby hashes. Formats include XML (nokogiri/sax-machine), XLSX (rubyXL), CSV, and fixed-width text. Refdata uses the new SwissReg XML format from a zip download (`files.refdata.ch`).
|
|
43
43
|
|
|
@@ -47,6 +47,10 @@ The system follows a **download → extract → build → compress** pipeline:
|
|
|
47
47
|
|
|
48
48
|
6. **Compressor** (`lib/oddb2xml/compressor.rb`) — Optional ZIP/TAR.GZ output compression.
|
|
49
49
|
|
|
50
|
+
7. **FHIR support** (`lib/oddb2xml/fhir_support.rb`) — Self-contained module providing `FhirDownloader` and FHIR NDJSON parsing. Activated via `--fhir` (or `--fhir-url=<URL>`). Downloads per-language NDJSON files (`foph-sl-export-latest-{de,fr,it}.ndjson`) from `epl.bag.admin.ch` to populate French and Italian product names/descriptions. Maps legal status codes `756005022007` and `756005022008` to Swissmedic category D.
|
|
51
|
+
|
|
52
|
+
8. **Refdata cleanup** (`lib/oddb2xml/refdata_cleanup.rb`) — Compensates for known data-quality issues in upstream Refdata.Articles.xml before they reach the output. Each fix is guarded by a Swissmedic-side heuristic (e.g. comma in `substance_swissmedic` to distinguish mono products from real combinations). Currently fixes the doubled-dose template bug (`X mg / X mg / Stk`). Called from `Builder#apply_refdata_description_cleanups!` at the start of `prepare_articles`. See GitHub issue #112 for the catalogue.
|
|
53
|
+
|
|
50
54
|
### Key data identifiers
|
|
51
55
|
- **GTIN/EAN13**: Primary article identifier (13-digit barcode)
|
|
52
56
|
- **Pharmacode**: Swiss pharmacy code
|
|
@@ -66,4 +70,4 @@ YAML files in `data/` provide manual overrides and mappings: `article_overrides.
|
|
|
66
70
|
## Ruby Version
|
|
67
71
|
|
|
68
72
|
- Minimum: Ruby >= 2.5.0 (gemspec)
|
|
69
|
-
- Current development: Ruby 3.
|
|
73
|
+
- Current development: Ruby 3.4.5 (`.ruby-version`)
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
oddb2xml (3.0.
|
|
4
|
+
oddb2xml (3.0.4)
|
|
5
5
|
htmlentities
|
|
6
6
|
httpi
|
|
7
7
|
mechanize (>= 2.8.5)
|
|
@@ -28,32 +28,37 @@ GEM
|
|
|
28
28
|
specs:
|
|
29
29
|
addressable (2.8.8)
|
|
30
30
|
public_suffix (>= 2.0.2, < 8.0)
|
|
31
|
-
akami (1.3.
|
|
31
|
+
akami (1.3.3)
|
|
32
|
+
base64
|
|
32
33
|
gyoku (>= 0.4.0)
|
|
33
34
|
nokogiri
|
|
34
|
-
ast (2.4.
|
|
35
|
+
ast (2.4.3)
|
|
35
36
|
base64 (0.3.0)
|
|
36
|
-
|
|
37
|
-
|
|
37
|
+
bigdecimal (4.0.1)
|
|
38
|
+
builder (3.3.0)
|
|
39
|
+
byebug (13.0.0)
|
|
40
|
+
reline (>= 0.6.0)
|
|
38
41
|
coderay (1.1.3)
|
|
39
42
|
connection_pool (3.0.2)
|
|
40
|
-
crack (0.
|
|
43
|
+
crack (1.0.1)
|
|
44
|
+
bigdecimal
|
|
41
45
|
rexml
|
|
42
|
-
diff-lcs (1.
|
|
46
|
+
diff-lcs (1.6.2)
|
|
43
47
|
domain_name (0.6.20240107)
|
|
44
|
-
flexmock (
|
|
48
|
+
flexmock (3.0.2)
|
|
45
49
|
gyoku (1.4.0)
|
|
46
50
|
builder (>= 2.1.2)
|
|
47
51
|
rexml (~> 3.0)
|
|
48
|
-
hashdiff (1.
|
|
49
|
-
htmlentities (4.
|
|
52
|
+
hashdiff (1.2.1)
|
|
53
|
+
htmlentities (4.4.2)
|
|
50
54
|
http-cookie (1.1.0)
|
|
51
55
|
domain_name (~> 0.5)
|
|
52
56
|
httpi (2.5.0)
|
|
53
57
|
rack
|
|
54
58
|
socksify
|
|
55
|
-
|
|
56
|
-
|
|
59
|
+
io-console (0.8.2)
|
|
60
|
+
json (2.18.1)
|
|
61
|
+
language_server-protocol (3.17.0.5)
|
|
57
62
|
lint_roller (1.1.0)
|
|
58
63
|
logger (1.7.0)
|
|
59
64
|
mechanize (2.14.0)
|
|
@@ -69,84 +74,95 @@ GEM
|
|
|
69
74
|
rubyntlm (~> 0.6, >= 0.6.3)
|
|
70
75
|
webrick (~> 1.7)
|
|
71
76
|
webrobots (~> 0.1.2)
|
|
72
|
-
method_source (1.
|
|
77
|
+
method_source (1.1.0)
|
|
73
78
|
mime-types (3.7.0)
|
|
74
79
|
logger
|
|
75
80
|
mime-types-data (~> 3.2025, >= 3.2025.0507)
|
|
76
81
|
mime-types-data (3.2026.0203)
|
|
77
82
|
mini_portile2 (2.8.9)
|
|
78
|
-
minitar (0
|
|
79
|
-
multi_json (1.
|
|
83
|
+
minitar (1.1.0)
|
|
84
|
+
multi_json (1.19.1)
|
|
80
85
|
mutex_m (0.3.0)
|
|
81
86
|
net-http-digest_auth (1.4.1)
|
|
82
87
|
net-http-persistent (4.0.8)
|
|
83
88
|
connection_pool (>= 2.2.4, < 4)
|
|
84
89
|
nkf (0.2.0)
|
|
85
|
-
nokogiri (1.19.
|
|
90
|
+
nokogiri (1.19.2)
|
|
86
91
|
mini_portile2 (~> 2.8.2)
|
|
87
92
|
racc (~> 1.4)
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
+
nokogiri (1.19.2-arm64-darwin)
|
|
94
|
+
racc (~> 1.4)
|
|
95
|
+
nori (2.7.1)
|
|
96
|
+
bigdecimal
|
|
97
|
+
optimist (3.2.1)
|
|
98
|
+
ox (2.14.23)
|
|
99
|
+
bigdecimal (>= 3.0)
|
|
100
|
+
parallel (1.27.0)
|
|
101
|
+
parser (3.3.10.2)
|
|
93
102
|
ast (~> 2.4.1)
|
|
94
103
|
racc
|
|
95
104
|
parslet (2.0.0)
|
|
96
|
-
|
|
105
|
+
prism (1.9.0)
|
|
106
|
+
pry (0.16.0)
|
|
97
107
|
coderay (~> 1.1)
|
|
98
108
|
method_source (~> 1.0)
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
109
|
+
reline (>= 0.6.0)
|
|
110
|
+
pry-byebug (3.12.0)
|
|
111
|
+
byebug (~> 13.0)
|
|
112
|
+
pry (>= 0.13, < 0.17)
|
|
113
|
+
pry-doc (1.7.0)
|
|
103
114
|
pry (~> 0.11)
|
|
104
|
-
yard (~> 0.9.
|
|
115
|
+
yard (~> 0.9.21)
|
|
105
116
|
psych (3.3.4)
|
|
106
117
|
public_suffix (7.0.2)
|
|
107
118
|
racc (1.8.1)
|
|
108
|
-
rack (3.2.
|
|
119
|
+
rack (3.2.6)
|
|
109
120
|
rainbow (3.1.1)
|
|
110
|
-
rake (13.
|
|
121
|
+
rake (13.3.1)
|
|
111
122
|
rdoc (6.3.4.1)
|
|
112
|
-
regexp_parser (2.
|
|
123
|
+
regexp_parser (2.11.3)
|
|
124
|
+
reline (0.6.3)
|
|
125
|
+
io-console (~> 0.5)
|
|
113
126
|
rexml (3.4.4)
|
|
114
|
-
rspec (3.
|
|
115
|
-
rspec-core (~> 3.
|
|
116
|
-
rspec-expectations (~> 3.
|
|
117
|
-
rspec-mocks (~> 3.
|
|
118
|
-
rspec-core (3.
|
|
119
|
-
rspec-support (~> 3.
|
|
120
|
-
rspec-expectations (3.
|
|
127
|
+
rspec (3.13.2)
|
|
128
|
+
rspec-core (~> 3.13.0)
|
|
129
|
+
rspec-expectations (~> 3.13.0)
|
|
130
|
+
rspec-mocks (~> 3.13.0)
|
|
131
|
+
rspec-core (3.13.6)
|
|
132
|
+
rspec-support (~> 3.13.0)
|
|
133
|
+
rspec-expectations (3.13.5)
|
|
121
134
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
122
|
-
rspec-support (~> 3.
|
|
123
|
-
rspec-mocks (3.
|
|
135
|
+
rspec-support (~> 3.13.0)
|
|
136
|
+
rspec-mocks (3.13.7)
|
|
124
137
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
125
|
-
rspec-support (~> 3.
|
|
126
|
-
rspec-support (3.
|
|
127
|
-
rubocop (1.
|
|
138
|
+
rspec-support (~> 3.13.0)
|
|
139
|
+
rspec-support (3.13.7)
|
|
140
|
+
rubocop (1.84.2)
|
|
128
141
|
json (~> 2.3)
|
|
142
|
+
language_server-protocol (~> 3.17.0.2)
|
|
143
|
+
lint_roller (~> 1.1.0)
|
|
129
144
|
parallel (~> 1.10)
|
|
130
|
-
parser (>= 3.
|
|
145
|
+
parser (>= 3.3.0.2)
|
|
131
146
|
rainbow (>= 2.2.2, < 4.0)
|
|
132
|
-
regexp_parser (>=
|
|
133
|
-
|
|
134
|
-
rubocop-ast (>= 1.28.0, < 2.0)
|
|
147
|
+
regexp_parser (>= 2.9.3, < 3.0)
|
|
148
|
+
rubocop-ast (>= 1.49.0, < 2.0)
|
|
135
149
|
ruby-progressbar (~> 1.7)
|
|
136
|
-
unicode-display_width (>= 2.4.0, <
|
|
137
|
-
rubocop-ast (1.
|
|
138
|
-
parser (>= 3.
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
150
|
+
unicode-display_width (>= 2.4.0, < 4.0)
|
|
151
|
+
rubocop-ast (1.49.0)
|
|
152
|
+
parser (>= 3.3.7.2)
|
|
153
|
+
prism (~> 1.7)
|
|
154
|
+
rubocop-performance (1.26.1)
|
|
155
|
+
lint_roller (~> 1.1)
|
|
156
|
+
rubocop (>= 1.75.0, < 2.0)
|
|
157
|
+
rubocop-ast (>= 1.47.1, < 2.0)
|
|
158
|
+
ruby-ole (1.2.13.1)
|
|
143
159
|
ruby-progressbar (1.13.0)
|
|
144
|
-
rubyXL (3.4.
|
|
160
|
+
rubyXL (3.4.33)
|
|
145
161
|
nokogiri (>= 1.10.8)
|
|
146
162
|
rubyzip (>= 1.3.0)
|
|
147
163
|
rubyntlm (0.6.5)
|
|
148
164
|
base64
|
|
149
|
-
rubyzip (3.0.
|
|
165
|
+
rubyzip (3.0.2)
|
|
150
166
|
savon (2.12.1)
|
|
151
167
|
akami (~> 1.2)
|
|
152
168
|
builder (>= 2.1.2)
|
|
@@ -156,31 +172,35 @@ GEM
|
|
|
156
172
|
nori (~> 2.4)
|
|
157
173
|
wasabi (~> 3.4)
|
|
158
174
|
sax-machine (1.3.2)
|
|
159
|
-
socksify (1.
|
|
160
|
-
spreadsheet (1.3.
|
|
175
|
+
socksify (1.8.1)
|
|
176
|
+
spreadsheet (1.3.4)
|
|
177
|
+
bigdecimal
|
|
178
|
+
logger
|
|
161
179
|
ruby-ole
|
|
162
|
-
standard (1.
|
|
180
|
+
standard (1.54.0)
|
|
163
181
|
language_server-protocol (~> 3.17.0.2)
|
|
164
182
|
lint_roller (~> 1.0)
|
|
165
|
-
rubocop (~> 1.
|
|
183
|
+
rubocop (~> 1.84.0)
|
|
166
184
|
standard-custom (~> 1.0.0)
|
|
167
|
-
standard-performance (~> 1.
|
|
185
|
+
standard-performance (~> 1.8)
|
|
168
186
|
standard-custom (1.0.2)
|
|
169
187
|
lint_roller (~> 1.0)
|
|
170
188
|
rubocop (~> 1.50)
|
|
171
|
-
standard-performance (1.0
|
|
172
|
-
lint_roller (~> 1.
|
|
173
|
-
rubocop-performance (~> 1.
|
|
189
|
+
standard-performance (1.9.0)
|
|
190
|
+
lint_roller (~> 1.1)
|
|
191
|
+
rubocop-performance (~> 1.26.0)
|
|
174
192
|
standardrb (1.0.1)
|
|
175
193
|
standard
|
|
176
|
-
timecop (0.9.
|
|
177
|
-
unicode-display_width (2.
|
|
178
|
-
|
|
194
|
+
timecop (0.9.10)
|
|
195
|
+
unicode-display_width (3.2.0)
|
|
196
|
+
unicode-emoji (~> 4.1)
|
|
197
|
+
unicode-emoji (4.2.0)
|
|
198
|
+
vcr (6.4.0)
|
|
179
199
|
wasabi (3.7.0)
|
|
180
200
|
addressable
|
|
181
201
|
httpi (~> 2.0)
|
|
182
202
|
nokogiri (>= 1.4.2)
|
|
183
|
-
webmock (3.
|
|
203
|
+
webmock (3.26.1)
|
|
184
204
|
addressable (>= 2.8.0)
|
|
185
205
|
crack (>= 0.3.2)
|
|
186
206
|
hashdiff (>= 0.4.0, < 2.0.0)
|
|
@@ -188,9 +208,10 @@ GEM
|
|
|
188
208
|
webrobots (0.1.2)
|
|
189
209
|
xml-simple (1.1.9)
|
|
190
210
|
rexml
|
|
191
|
-
yard (0.9.
|
|
211
|
+
yard (0.9.38)
|
|
192
212
|
|
|
193
213
|
PLATFORMS
|
|
214
|
+
arm64-darwin-25
|
|
194
215
|
ruby
|
|
195
216
|
|
|
196
217
|
DEPENDENCIES
|
|
@@ -211,4 +232,4 @@ DEPENDENCIES
|
|
|
211
232
|
webmock
|
|
212
233
|
|
|
213
234
|
BUNDLED WITH
|
|
214
|
-
2.
|
|
235
|
+
2.5.22
|
data/History.txt
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
=== 3.0.4 / 24.04.2026
|
|
2
|
+
* Firstbase: switch -b/--firstbase from the deprecated pillbox.oddb.org XLSX to the GS1 Switzerland CSV at https://id.gs1.ch/01/07612345000961 (full firstbase barcode registry, ~189k items). Downloaded file is now firstbase.csv; FirstbaseExtractor now parses CSV with headers instead of XLSX.
|
|
3
|
+
|
|
1
4
|
=== 3.0.3 / 24.04.2026
|
|
2
5
|
* FHIR: download per-language NDJSON files (foph-sl-export-latest-{de,fr,it}.ndjson) so French and Italian product names/descriptions are populated
|
|
3
6
|
* FHIR: map legal status code 756005022008 to Swissmedic category D (in addition to 756005022007)
|
data/README.md
CHANGED
|
@@ -51,7 +51,7 @@ HIN (http://hin.ch) creates daily the actual file. They can be downloaded from `
|
|
|
51
51
|
see `--help`.
|
|
52
52
|
|
|
53
53
|
```
|
|
54
|
-
/opt/src/
|
|
54
|
+
/opt/src/oddb2xml/bin/oddb2xml version 3.0.5
|
|
55
55
|
Usage:
|
|
56
56
|
oddb2xml [option]
|
|
57
57
|
produced files are found under data
|
|
@@ -61,6 +61,10 @@ see `--help`.
|
|
|
61
61
|
-e, --extended pharma, non-pharma plus prices and non-pharma from zurrose.
|
|
62
62
|
Products without EAN-Code will also be listed.
|
|
63
63
|
File oddb_calc.xml will also be generated
|
|
64
|
+
--fhir Use FHIR NDJSON format from FOPH/BAG instead of XML
|
|
65
|
+
from Spezialitätenliste. Downloads per-language
|
|
66
|
+
NDJSON files (de, fr, it) from epl.bag.admin.ch.
|
|
67
|
+
--fhir-url=<s> Specific FHIR NDJSON URL to download (implies --fhir)
|
|
64
68
|
-f, --format=<s> File format F, default is xml. {xml|dat}
|
|
65
69
|
If F is given, -o option is ignored. (Default: xml)
|
|
66
70
|
-i, --include Include target option for ean14 for 'dat' format.
|
|
@@ -80,6 +84,8 @@ see `--help`.
|
|
|
80
84
|
Downloaded files are saved under downloads
|
|
81
85
|
--log log important actions
|
|
82
86
|
-u, --use-ra11zip=<s> Use the ra11.zip (a zipped transfer.dat from Galexis)
|
|
87
|
+
-b, --firstbase Build all NONPHARMA articles on firstbase
|
|
88
|
+
(GS1 Switzerland CSV from id.gs1.ch)
|
|
83
89
|
-v, --version Print version and exit
|
|
84
90
|
-h, --help Show this message
|
|
85
91
|
```
|
|
@@ -106,8 +112,8 @@ FR
|
|
|
106
112
|
|
|
107
113
|
## Supported ruby version
|
|
108
114
|
|
|
109
|
-
|
|
110
|
-
|
|
115
|
+
You will need Ruby >= 2.5 for oddb2xml to work correctly. Current development happens on Ruby 3.4 (`.ruby-version`).
|
|
116
|
+
CI runs on Ruby 3.0, 3.1 and 3.2 via GitHub Actions — see the badge above for the latest spec results.
|
|
111
117
|
|
|
112
118
|
|
|
113
119
|
## XSD files
|
|
@@ -283,6 +289,29 @@ We use the following files:
|
|
|
283
289
|
* http://download.swissmedicinfo.ch/ (AipsDownload)
|
|
284
290
|
* https://raw.githubusercontent.com/zdavatz/oddb2xml_files/master/LPPV.txt
|
|
285
291
|
* https://raw.githubusercontent.com/zdavatz/cpp2sqlite/master/input/atc_codes_multi_lingual.txt
|
|
292
|
+
* https://epl.bag.admin.ch/static/fhir/foph-sl-export-latest-{de,fr,it}.ndjson (FHIR NDJSON, used with `--fhir`)
|
|
293
|
+
* https://id.gs1.ch/01/07612345000961 (GS1 Switzerland firstbase CSV — full barcode registry, used with `-b`/`--firstbase`)
|
|
294
|
+
|
|
295
|
+
## Refdata data-quality compensation
|
|
296
|
+
|
|
297
|
+
Refdata.Articles.xml from `files.refdata.ch` ships with a number of recurring
|
|
298
|
+
data-quality issues that propagate into downstream systems unchanged. oddb2xml
|
|
299
|
+
applies a small set of conservative cleanups before emitting any output. See
|
|
300
|
+
GitHub issue [#112](https://github.com/zdavatz/oddb2xml/issues/112) for the
|
|
301
|
+
full catalogue and the parallel report sent to Refdata.
|
|
302
|
+
|
|
303
|
+
Currently active fixes (`lib/oddb2xml/refdata_cleanup.rb`):
|
|
304
|
+
|
|
305
|
+
* **Doubled dose token** — Refdata sometimes emits the strength twice in
|
|
306
|
+
`<FullName>`, e.g. `MIRTAZAPIN Sandoz eco 30 mg / 30 mg / 100 Tablette`.
|
|
307
|
+
When the matching Swissmedic entry shows a single active substance, the
|
|
308
|
+
duplicate token is collapsed to a single occurrence. Real combination
|
|
309
|
+
products (e.g. PHESGO 600 mg / 600 mg / 10 ml — pertuzumab + trastuzumab)
|
|
310
|
+
are detected via the comma in `substance_swissmedic` and left untouched.
|
|
311
|
+
|
|
312
|
+
The cleanup runs at the start of `prepare_articles` in `Builder` and is
|
|
313
|
+
idempotent. Each rule is guarded by a Swissmedic-side heuristic so genuine
|
|
314
|
+
data is never altered.
|
|
286
315
|
|
|
287
316
|
## Rules for matching GTIN (aka EAN13), product number and IKSNR
|
|
288
317
|
|
data/lib/oddb2xml/builder.rb
CHANGED
|
@@ -88,12 +88,42 @@ module Oddb2xml
|
|
|
88
88
|
end
|
|
89
89
|
end
|
|
90
90
|
|
|
91
|
+
# Mutates @refdata in place to compensate for known Refdata.Articles.xml
|
|
92
|
+
# data-quality issues (see GitHub issue #112). Idempotent: subsequent
|
|
93
|
+
# calls are no-ops within the same Builder instance.
|
|
94
|
+
def apply_refdata_description_cleanups!
|
|
95
|
+
return if @refdata_descriptions_cleaned
|
|
96
|
+
@refdata_descriptions_cleaned = true
|
|
97
|
+
return if @refdata.nil? || @refdata.empty?
|
|
98
|
+
double_dose_fixed = 0
|
|
99
|
+
@refdata.each_value do |item|
|
|
100
|
+
next unless item.is_a?(Hash)
|
|
101
|
+
no8 = item[:no8]
|
|
102
|
+
next if no8.nil? || no8.empty?
|
|
103
|
+
pack = @packs[no8]
|
|
104
|
+
next unless pack
|
|
105
|
+
substance = pack[:substance_swissmedic]
|
|
106
|
+
[:desc_de, :desc_fr, :desc_it].each do |key|
|
|
107
|
+
original = item[key]
|
|
108
|
+
cleaned = RefdataCleanup.fix_double_dose(original, substance)
|
|
109
|
+
if cleaned != original
|
|
110
|
+
item[key] = cleaned
|
|
111
|
+
double_dose_fixed += 1
|
|
112
|
+
end
|
|
113
|
+
end
|
|
114
|
+
end
|
|
115
|
+
if double_dose_fixed > 0
|
|
116
|
+
Oddb2xml.log("Refdata cleanup: fixed double-dose pattern in #{double_dose_fixed} description(s)")
|
|
117
|
+
end
|
|
118
|
+
end
|
|
119
|
+
|
|
91
120
|
private_class_method
|
|
92
121
|
|
|
93
122
|
def prepare_articles(reset = false)
|
|
94
123
|
@articles = nil if reset
|
|
95
124
|
unless @articles
|
|
96
125
|
Oddb2xml.log("prepare_articles starting with #{@articles ? @articles.size : "no"} articles.")
|
|
126
|
+
apply_refdata_description_cleanups!
|
|
97
127
|
@articles = []
|
|
98
128
|
@refdata.each do |ean13, obj|
|
|
99
129
|
unless SKIP_MIGEL_DOWNLOADER
|
data/lib/oddb2xml/downloader.rb
CHANGED
|
@@ -342,14 +342,14 @@ module Oddb2xml
|
|
|
342
342
|
end
|
|
343
343
|
|
|
344
344
|
class FirstbaseDownloader < Downloader
|
|
345
|
-
BASE_URL = "
|
|
345
|
+
BASE_URL = "https://id.gs1.ch/01/07612345000961"
|
|
346
346
|
include DownloadMethod
|
|
347
347
|
def initialize(type = :orphan, options = {})
|
|
348
|
-
@url = BASE_URL
|
|
348
|
+
@url = BASE_URL
|
|
349
349
|
end
|
|
350
350
|
|
|
351
351
|
def download
|
|
352
|
-
@file2save = File.join(DOWNLOADS, "firstbase.
|
|
352
|
+
@file2save = File.join(DOWNLOADS, "firstbase.csv")
|
|
353
353
|
report_download(@url, @file2save)
|
|
354
354
|
begin
|
|
355
355
|
download_as(@file2save, "w+")
|
data/lib/oddb2xml/extractor.rb
CHANGED
|
@@ -608,32 +608,28 @@ module Oddb2xml
|
|
|
608
608
|
|
|
609
609
|
class FirstbaseExtractor < Extractor
|
|
610
610
|
def initialize(file)
|
|
611
|
-
@
|
|
611
|
+
@file = file
|
|
612
612
|
end
|
|
613
613
|
|
|
614
614
|
def to_hash
|
|
615
615
|
data = {}
|
|
616
|
-
return data unless @
|
|
617
|
-
@
|
|
618
|
-
|
|
619
|
-
if
|
|
620
|
-
puts "Empty row (#{i}) in firstbase"
|
|
621
|
-
next
|
|
622
|
-
end
|
|
623
|
-
gtin = row[0].value.to_s.gsub(/^0+/, '')
|
|
616
|
+
return data unless @file && File.exist?(@file)
|
|
617
|
+
CSV.foreach(@file, headers: true, encoding: "UTF-8") do |row|
|
|
618
|
+
gtin = row["Gtin"].to_s.gsub(/^0+/, "")
|
|
619
|
+
next if gtin.empty?
|
|
624
620
|
data[gtin] = {
|
|
625
621
|
gtin: gtin,
|
|
626
|
-
gln: row[
|
|
627
|
-
target_market: row[
|
|
628
|
-
gpc: row[
|
|
629
|
-
trade_item_description_de: row[
|
|
630
|
-
trade_item_description_en:
|
|
631
|
-
trade_item_description_fr: row[
|
|
632
|
-
trade_item_description_it: row[
|
|
633
|
-
manufacturer_name: row[
|
|
634
|
-
start_availability_date: row[
|
|
635
|
-
gross_weight:
|
|
636
|
-
net_weight:
|
|
622
|
+
gln: row["InformationProviderGln"].to_s,
|
|
623
|
+
target_market: row["TargetMarketCountryCode"].to_s,
|
|
624
|
+
gpc: row["GpcCategoryCode"].to_s,
|
|
625
|
+
trade_item_description_de: row["TradeItemDescription_DE"].to_s,
|
|
626
|
+
trade_item_description_en: "",
|
|
627
|
+
trade_item_description_fr: row["TradeItemDescription_FR"].to_s,
|
|
628
|
+
trade_item_description_it: row["TradeItemDescription_IT"].to_s,
|
|
629
|
+
manufacturer_name: row["InformationProviderPartyName"].to_s,
|
|
630
|
+
start_availability_date: row["Date_Created_Batch"].to_s,
|
|
631
|
+
gross_weight: "",
|
|
632
|
+
net_weight: "",
|
|
637
633
|
}
|
|
638
634
|
end
|
|
639
635
|
data
|
data/lib/oddb2xml/options.rb
CHANGED
|
@@ -45,7 +45,7 @@ module Oddb2xml
|
|
|
45
45
|
opt :log, "log important actions", short: :none
|
|
46
46
|
opt :use_ra11zip, "Use the ra11.zip (a zipped transfer.dat from Galexis)",
|
|
47
47
|
default: File.exist?("ra11.zip") ? "ra11.zip" : nil, type: :string
|
|
48
|
-
opt :firstbase, "Build all NONPHARMA articles on firstbase", short: "b", default: false
|
|
48
|
+
opt :firstbase, "Build all NONPHARMA articles on firstbase (GS1 Switzerland CSV from id.gs1.ch)", short: "b", default: false
|
|
49
49
|
end
|
|
50
50
|
|
|
51
51
|
@opts[:percent] = @opts[:increment]
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
module Oddb2xml
|
|
2
|
+
# Compensates for known data-quality issues in upstream Refdata.Articles.xml
|
|
3
|
+
# before they reach the generated output. Each fix is opt-in and guarded by
|
|
4
|
+
# a heuristic against Swissmedic data so we never alter genuine combination
|
|
5
|
+
# products. See GitHub issue #112 for the catalogue of upstream problems.
|
|
6
|
+
module RefdataCleanup
|
|
7
|
+
DOSE_TOKEN = /\d+(?:[.,]\d+)?\s*(?:mg|µg|mcg|g|ml|UI|U\.I\.|IE|%)/i
|
|
8
|
+
# Matches "<dose> / <same dose> /" – the templating bug where Refdata
|
|
9
|
+
# repeats the strength once. The backreference \1 only matches when the
|
|
10
|
+
# exact same dose string appears twice, so differing doses
|
|
11
|
+
# (X mg / Y mg) are never collapsed. Equal-dose real combos (e.g. PHESGO
|
|
12
|
+
# 600 mg / 600 mg / 10 ml) still match and rely on the single_substance? guard.
|
|
13
|
+
DOUBLE_DOSE_RE = /(#{DOSE_TOKEN})\s*\/\s*\1\s*\/\s*/
|
|
14
|
+
|
|
15
|
+
# A Swissmedic compositions cell like "mirtazapinum" indicates a mono
|
|
16
|
+
# product; "atovaquonum, proguanili hydrochloridum" or
|
|
17
|
+
# "pertuzumabum, trastuzumabum" indicates a real combination.
|
|
18
|
+
def self.single_substance?(swissmedic_substance)
|
|
19
|
+
return false if swissmedic_substance.nil?
|
|
20
|
+
str = swissmedic_substance.to_s.strip
|
|
21
|
+
return false if str.empty?
|
|
22
|
+
!str.include?(",")
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
# Removes the duplicated dose token in mono products. Returns the
|
|
26
|
+
# cleaned description, or the original string if no change applies.
|
|
27
|
+
def self.fix_double_dose(desc, swissmedic_substance)
|
|
28
|
+
return desc if desc.nil? || desc.empty?
|
|
29
|
+
return desc unless DOUBLE_DOSE_RE.match?(desc)
|
|
30
|
+
return desc unless single_substance?(swissmedic_substance)
|
|
31
|
+
desc.sub(DOUBLE_DOSE_RE, '\1 / ')
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
end
|
data/lib/oddb2xml/version.rb
CHANGED
data/lib/oddb2xml.rb
CHANGED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
require "spec_helper"
|
|
2
|
+
require "oddb2xml/refdata_cleanup"
|
|
3
|
+
|
|
4
|
+
describe Oddb2xml::RefdataCleanup do
|
|
5
|
+
describe ".single_substance?" do
|
|
6
|
+
it "returns true for a single Swissmedic substance" do
|
|
7
|
+
expect(described_class.single_substance?("mirtazapinum")).to be true
|
|
8
|
+
expect(described_class.single_substance?("methotrexatum")).to be true
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
it "returns false when multiple substances are listed (combo)" do
|
|
12
|
+
expect(described_class.single_substance?("pertuzumabum, trastuzumabum")).to be false
|
|
13
|
+
expect(described_class.single_substance?("atovaquonum, proguanili hydrochloridum")).to be false
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
it "returns false when input is nil or empty" do
|
|
17
|
+
expect(described_class.single_substance?(nil)).to be false
|
|
18
|
+
expect(described_class.single_substance?("")).to be false
|
|
19
|
+
expect(described_class.single_substance?(" ")).to be false
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
describe ".fix_double_dose" do
|
|
24
|
+
let(:mono) { "mirtazapinum" }
|
|
25
|
+
let(:combo) { "pertuzumabum, trastuzumabum" }
|
|
26
|
+
|
|
27
|
+
it "removes the duplicate dose for a mono product" do
|
|
28
|
+
input = "MIRTAZAPIN Sandoz eco 30 mg / 30 mg / 100 Tablette"
|
|
29
|
+
expected = "MIRTAZAPIN Sandoz eco 30 mg / 100 Tablette"
|
|
30
|
+
expect(described_class.fix_double_dose(input, mono)).to eq expected
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
it "handles ICATIBANT-style spacing" do
|
|
34
|
+
input = "ICATIBANT Spirig HC 30 mg / 30 mg / 1 x 3 ml"
|
|
35
|
+
expected = "ICATIBANT Spirig HC 30 mg / 1 x 3 ml"
|
|
36
|
+
expect(described_class.fix_double_dose(input, mono)).to eq expected
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
it "leaves real combinations untouched (PHESGO 600 mg / 600 mg / 10 ml)" do
|
|
40
|
+
input = "PHESGO Inj Lös 600 mg/600 mg/10 ml Durchstf"
|
|
41
|
+
expect(described_class.fix_double_dose(input, combo)).to eq input
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
it "leaves descriptions without the double-dose pattern untouched" do
|
|
45
|
+
input = "LEVOCETIRIZIN Spirig HC Filmtabl 5 mg 10 Stk"
|
|
46
|
+
expect(described_class.fix_double_dose(input, mono)).to eq input
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
it "leaves the description untouched when Swissmedic substance is unknown" do
|
|
50
|
+
input = "MIRTAZAPIN Sandoz eco 30 mg / 30 mg / 100 Tablette"
|
|
51
|
+
expect(described_class.fix_double_dose(input, nil)).to eq input
|
|
52
|
+
expect(described_class.fix_double_dose(input, "")).to eq input
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
it "is a no-op for nil or empty descriptions" do
|
|
56
|
+
expect(described_class.fix_double_dose(nil, mono)).to be_nil
|
|
57
|
+
expect(described_class.fix_double_dose("", mono)).to eq ""
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
it "does not collapse different doses (X mg / Y mg)" do
|
|
61
|
+
input = "FOO 250 mg / 100 mg / 12 Stk"
|
|
62
|
+
expect(described_class.fix_double_dose(input, combo)).to eq input
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
describe Oddb2xml::Builder do
|
|
68
|
+
describe "#apply_refdata_description_cleanups!" do
|
|
69
|
+
let(:builder) { Oddb2xml::Builder.new }
|
|
70
|
+
|
|
71
|
+
it "fixes double-dose entries on mono products" do
|
|
72
|
+
builder.packs = {
|
|
73
|
+
"69475006" => {substance_swissmedic: "mirtazapinum"}
|
|
74
|
+
}
|
|
75
|
+
builder.refdata = {
|
|
76
|
+
"7680694750066" => {
|
|
77
|
+
ean13: "7680694750066",
|
|
78
|
+
no8: "69475006",
|
|
79
|
+
desc_de: "MIRTAZAPIN Sandoz eco 30 mg / 30 mg / 100 Tablette",
|
|
80
|
+
desc_fr: "MIRTAZAPIN Sandoz eco 30 mg / 30 mg / 100 comprimé(",
|
|
81
|
+
desc_it: ""
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
builder.apply_refdata_description_cleanups!
|
|
86
|
+
|
|
87
|
+
item = builder.refdata["7680694750066"]
|
|
88
|
+
expect(item[:desc_de]).to eq "MIRTAZAPIN Sandoz eco 30 mg / 100 Tablette"
|
|
89
|
+
expect(item[:desc_fr]).to eq "MIRTAZAPIN Sandoz eco 30 mg / 100 comprimé("
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
it "leaves combo products untouched" do
|
|
93
|
+
builder.packs = {
|
|
94
|
+
"67828001" => {substance_swissmedic: "pertuzumabum, trastuzumabum"}
|
|
95
|
+
}
|
|
96
|
+
original = "PHESGO Inj Lös 600 mg/600 mg/10 ml Durchstf"
|
|
97
|
+
builder.refdata = {
|
|
98
|
+
"7680678280013" => {
|
|
99
|
+
ean13: "7680678280013",
|
|
100
|
+
no8: "67828001",
|
|
101
|
+
desc_de: original,
|
|
102
|
+
desc_fr: "",
|
|
103
|
+
desc_it: ""
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
builder.apply_refdata_description_cleanups!
|
|
108
|
+
|
|
109
|
+
expect(builder.refdata["7680678280013"][:desc_de]).to eq original
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
it "is idempotent" do
|
|
113
|
+
builder.packs = {
|
|
114
|
+
"69475006" => {substance_swissmedic: "mirtazapinum"}
|
|
115
|
+
}
|
|
116
|
+
builder.refdata = {
|
|
117
|
+
"7680694750066" => {
|
|
118
|
+
ean13: "7680694750066",
|
|
119
|
+
no8: "69475006",
|
|
120
|
+
desc_de: "MIRTAZAPIN Sandoz eco 30 mg / 30 mg / 100 Tablette",
|
|
121
|
+
desc_fr: "",
|
|
122
|
+
desc_it: ""
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
builder.apply_refdata_description_cleanups!
|
|
127
|
+
builder.apply_refdata_description_cleanups!
|
|
128
|
+
|
|
129
|
+
expect(builder.refdata["7680694750066"][:desc_de])
|
|
130
|
+
.to eq "MIRTAZAPIN Sandoz eco 30 mg / 100 Tablette"
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
it "skips entries without a Swissmedic match" do
|
|
134
|
+
builder.packs = {}
|
|
135
|
+
input = "MIRTAZAPIN Sandoz eco 30 mg / 30 mg / 100 Tablette"
|
|
136
|
+
builder.refdata = {
|
|
137
|
+
"7680694750066" => {
|
|
138
|
+
ean13: "7680694750066",
|
|
139
|
+
no8: "69475006",
|
|
140
|
+
desc_de: input,
|
|
141
|
+
desc_fr: "",
|
|
142
|
+
desc_it: ""
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
builder.apply_refdata_description_cleanups!
|
|
147
|
+
|
|
148
|
+
expect(builder.refdata["7680694750066"][:desc_de]).to eq input
|
|
149
|
+
end
|
|
150
|
+
end
|
|
151
|
+
end
|
metadata
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: oddb2xml
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 3.0.
|
|
4
|
+
version: 3.0.5
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Yasuhiro Asaka, Zeno R.R. Davatz, Niklaus Giger
|
|
8
|
-
autorequire:
|
|
9
8
|
bindir: bin
|
|
10
9
|
cert_chain: []
|
|
11
|
-
date:
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
12
11
|
dependencies:
|
|
13
12
|
- !ruby/object:Gem::Dependency
|
|
14
13
|
name: rubyzip
|
|
@@ -470,6 +469,7 @@ files:
|
|
|
470
469
|
- lib/oddb2xml/fhir_support.rb
|
|
471
470
|
- lib/oddb2xml/options.rb
|
|
472
471
|
- lib/oddb2xml/parslet_compositions.rb
|
|
472
|
+
- lib/oddb2xml/refdata_cleanup.rb
|
|
473
473
|
- lib/oddb2xml/semantic_check.rb
|
|
474
474
|
- lib/oddb2xml/util.rb
|
|
475
475
|
- lib/oddb2xml/version.rb
|
|
@@ -544,6 +544,7 @@ files:
|
|
|
544
544
|
- spec/galenic_spec.rb
|
|
545
545
|
- spec/options_spec.rb
|
|
546
546
|
- spec/parslet_spec.rb
|
|
547
|
+
- spec/refdata_cleanup_spec.rb
|
|
547
548
|
- spec/spec_helper.rb
|
|
548
549
|
- test_options.rb
|
|
549
550
|
- tools/cacert.pem
|
|
@@ -553,7 +554,6 @@ homepage: https://github.com/zdavatz/oddb2xml
|
|
|
553
554
|
licenses:
|
|
554
555
|
- GPL-3.0-only
|
|
555
556
|
metadata: {}
|
|
556
|
-
post_install_message:
|
|
557
557
|
rdoc_options: []
|
|
558
558
|
require_paths:
|
|
559
559
|
- lib
|
|
@@ -568,8 +568,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
568
568
|
- !ruby/object:Gem::Version
|
|
569
569
|
version: '0'
|
|
570
570
|
requirements: []
|
|
571
|
-
rubygems_version: 3.
|
|
572
|
-
signing_key:
|
|
571
|
+
rubygems_version: 3.6.9
|
|
573
572
|
specification_version: 4
|
|
574
573
|
summary: oddb2xml creates xml files.
|
|
575
574
|
test_files:
|
|
@@ -640,4 +639,5 @@ test_files:
|
|
|
640
639
|
- spec/galenic_spec.rb
|
|
641
640
|
- spec/options_spec.rb
|
|
642
641
|
- spec/parslet_spec.rb
|
|
642
|
+
- spec/refdata_cleanup_spec.rb
|
|
643
643
|
- spec/spec_helper.rb
|