oddb2xml 2.0.5 → 2.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +13 -5
- data/Gemfile.lock +26 -19
- data/History.txt +5 -0
- data/dokumentation_calc.textile +50 -0
- data/lib/oddb2xml/builder.rb +9 -4
- data/lib/oddb2xml/calc.rb +1 -1
- data/lib/oddb2xml/cli.rb +1 -0
- data/lib/oddb2xml/compositions_syntax.rb +368 -0
- data/lib/oddb2xml/extractor.rb +13 -4
- data/lib/oddb2xml/parslet_compositions.rb +598 -0
- data/lib/oddb2xml/version.rb +1 -1
- data/oddb2xml.gemspec +1 -0
- data/spec/builder_spec.rb +1 -1
- data/spec/calc_spec.rb +102 -121
- data/spec/composition_syntax_spec.rb +502 -0
- data/spec/data/compositions.txt +8937 -0
- data/spec/data/swissmedic_package-galenic.xlsx +0 -0
- data/spec/data/zurrose_transfer.dat +5 -0
- data/spec/extractor_spec.rb +40 -0
- data/spec/parslet_spec.rb +1268 -0
- data/spec/spec_helper.rb +8 -0
- metadata +56 -34
- data/lib/oddb2xml/parse_compositions.rb +0 -106
checksums.yaml
CHANGED
@@ -1,7 +1,15 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
MDA5ZTJmMjQ1NzlhMWM0ZjE5YzY5NjA2NWFjNGFlYjQxMjI0NjIzYw==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
MmUxZWYwYTBiYjkyMjdiMGMzM2QwZjQwOWI5MTZjZTIxMmQ2YzhhYw==
|
5
7
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
MDBiZDcyMzQyZTMzMDhiNDFmNGQxOTAwOTc2MjA1OTY4NjMyNzhiNWE3NzI4
|
10
|
+
ODVlM2RmMTg2MTRkODdjODhlM2ExNDVkOTUyOTgwZTM3MzY1MzA1Y2Q2MmRi
|
11
|
+
NjI4Zjc0OTlkM2M0NTIzMmZlOTlmYzcwMGQ0YTdlMTc5NjZiNjY=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
OGY4NThmMTRhNDVhY2ZlYzdmZmYwMThjMGY0NTM5MmJhYjMxYThjOTE4ZTUy
|
14
|
+
MmYwYzlhOWI5OTQ5OWQ0MDllMjk2YmM3OGY3YzUzMDE4MTM0YTYzOTRlY2Uz
|
15
|
+
NzM0NTQwZTlhOWQwZjMwZDlmMGYzMWM2ZmIzZDhlZjlmMDE1MjQ=
|
data/Gemfile.lock
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
oddb2xml (2.0.
|
4
|
+
oddb2xml (2.0.6)
|
5
5
|
archive-tar-minitar (~> 0.5.2)
|
6
6
|
mechanize (~> 2.5.1)
|
7
7
|
nokogiri (~> 1.5.10)
|
8
|
+
parslet (~> 1.7.0)
|
8
9
|
rubyXL (~> 3.3.1)
|
9
10
|
rubyzip (~> 1.1.3)
|
10
11
|
savon (~> 2.4.0)
|
@@ -14,11 +15,12 @@ PATH
|
|
14
15
|
GEM
|
15
16
|
remote: https://rubygems.org/
|
16
17
|
specs:
|
17
|
-
addressable (2.3.
|
18
|
+
addressable (2.3.8)
|
18
19
|
akami (1.2.2)
|
19
20
|
gyoku (>= 0.4.0)
|
20
21
|
nokogiri
|
21
22
|
archive-tar-minitar (0.5.2)
|
23
|
+
blankslate (3.1.3)
|
22
24
|
builder (3.2.2)
|
23
25
|
coderay (1.1.0)
|
24
26
|
columnize (0.9.0)
|
@@ -29,15 +31,16 @@ GEM
|
|
29
31
|
debugger-linecache (~> 1.2.0)
|
30
32
|
debugger-ruby_core_source (~> 1.3.5)
|
31
33
|
debugger-linecache (1.2.0)
|
32
|
-
debugger-ruby_core_source (1.3.
|
34
|
+
debugger-ruby_core_source (1.3.8)
|
33
35
|
diff-lcs (1.2.5)
|
34
|
-
domain_name (0.5.
|
36
|
+
domain_name (0.5.24)
|
35
37
|
unf (>= 0.0.5, < 1.0.0)
|
36
38
|
gyoku (1.1.1)
|
37
39
|
builder (>= 2.1.2)
|
38
40
|
httpi (2.1.1)
|
39
41
|
rack
|
40
42
|
rubyntlm (~> 0.3.2)
|
43
|
+
json (1.8.2)
|
41
44
|
mechanize (2.5.1)
|
42
45
|
domain_name (~> 0.5, >= 0.5.1)
|
43
46
|
mime-types (~> 1.17, >= 1.17.2)
|
@@ -53,6 +56,8 @@ GEM
|
|
53
56
|
nokogiri (1.5.11)
|
54
57
|
nori (2.3.0)
|
55
58
|
ntlm-http (0.1.1)
|
59
|
+
parslet (1.7.0)
|
60
|
+
blankslate (>= 2.0, <= 4.0)
|
56
61
|
pry (0.10.1)
|
57
62
|
coderay (~> 1.1.0)
|
58
63
|
method_source (~> 0.8.1)
|
@@ -63,20 +68,22 @@ GEM
|
|
63
68
|
rack (1.6.0)
|
64
69
|
rake (10.4.2)
|
65
70
|
rdoc (4.2.0)
|
66
|
-
|
67
|
-
|
68
|
-
rspec-
|
69
|
-
rspec-
|
70
|
-
|
71
|
-
|
72
|
-
|
71
|
+
json (~> 1.4)
|
72
|
+
rspec (3.2.0)
|
73
|
+
rspec-core (~> 3.2.0)
|
74
|
+
rspec-expectations (~> 3.2.0)
|
75
|
+
rspec-mocks (~> 3.2.0)
|
76
|
+
rspec-core (3.2.3)
|
77
|
+
rspec-support (~> 3.2.0)
|
78
|
+
rspec-expectations (3.2.1)
|
73
79
|
diff-lcs (>= 1.2.0, < 2.0)
|
74
|
-
rspec-support (~> 3.
|
75
|
-
rspec-mocks (3.1
|
76
|
-
|
77
|
-
|
80
|
+
rspec-support (~> 3.2.0)
|
81
|
+
rspec-mocks (3.2.1)
|
82
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
83
|
+
rspec-support (~> 3.2.0)
|
84
|
+
rspec-support (3.2.2)
|
78
85
|
ruby-ole (1.2.11.8)
|
79
|
-
rubyXL (3.3.
|
86
|
+
rubyXL (3.3.8)
|
80
87
|
nokogiri (>= 1.4.4)
|
81
88
|
rubyzip (>= 1.1.6)
|
82
89
|
rubyntlm (0.3.4)
|
@@ -93,16 +100,16 @@ GEM
|
|
93
100
|
sax-machine (0.1.0)
|
94
101
|
nokogiri (> 0.0.0)
|
95
102
|
slop (3.6.0)
|
96
|
-
spreadsheet (1.0.
|
103
|
+
spreadsheet (1.0.3)
|
97
104
|
ruby-ole (>= 1.0)
|
98
105
|
unf (0.1.4)
|
99
106
|
unf_ext
|
100
|
-
unf_ext (0.0.
|
107
|
+
unf_ext (0.0.7.1)
|
101
108
|
wasabi (3.2.3)
|
102
109
|
httpi (~> 2.0)
|
103
110
|
mime-types (< 2.0.0)
|
104
111
|
nokogiri (>= 1.4.0)
|
105
|
-
webmock (1.
|
112
|
+
webmock (1.21.0)
|
106
113
|
addressable (>= 2.3.6)
|
107
114
|
crack (>= 0.3.2)
|
108
115
|
webrobots (0.1.1)
|
data/History.txt
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
=== 2.0.6 / 27.04.2015
|
2
|
+
|
3
|
+
* Remove trailing spaces in names imported from ZurRose transfer.dat
|
4
|
+
* Use ISO-8859-14 when reading ZurRose transfer.dat to avoid unreadable characters, e.g. Ethacridin
|
5
|
+
|
1
6
|
=== 2.0.5 / 31.03.2015
|
2
7
|
|
3
8
|
* --calc adds boolean flag is_active_agent
|
@@ -0,0 +1,50 @@
|
|
1
|
+
h1. Dokumentation für oddb2xml --calc
|
2
|
+
|
3
|
+
Wenn man oddb2xml mit der Option @--calc@ aufruft, wird eine Datei @oddb_calc.xml@ erstellt, welche aufgrund diverser offener Datenquellen (u.a. "Excel-Version Zugelassene Verpackungen":https://www.swissmedic.ch/arzneimittel/00156/00221/00222/00230/index.html?lang=de ) die Zusammensetzung aller in der Schweiz öffentlich zugelassenen Medikamente im XML-Format enthält.
|
4
|
+
|
5
|
+
h2. Ziel
|
6
|
+
|
7
|
+
Die Swissmedic verpackt in der Excel-Datei für die Packungsbeschreibung in der Spalte Q @Zusammensetzung@ eine Menge an Informationen darüber, aus welchen Bestandteilen ein Medikament besteht. Zu einem grossen Teil folgt dies einer Syntax, für welche keine öffentlich zugängliche Dokumentation zu bestehen scheint. Deshalb wurde für das Projekt ODDB.org anfangs 2015 beschlossen, die Analyse dieses Feldes von einem auf (was SW-Ingenieure so nennen) regulären Ausdrücken basierenden Algorithmus auf einen echten Parser umzustellen.
|
8
|
+
|
9
|
+
In diesem Dokument versuchen wir die dabei erkannten Elemente kurz zu dokumentieren und Grenzfälle und Entscheide anhand von Beispielen (mit IKSNR und Name identifiert) zu begründen.
|
10
|
+
|
11
|
+
|
12
|
+
h2. Syntax
|
13
|
+
|
14
|
+
Die Syntax wird in einer Datei "compositions_syntax":https://raw.githubusercontent.com/ngiger/oddb2xml/master/lib/oddb2xml/compositions_syntax.rb beschrieben.
|
15
|
+
|
16
|
+
* ratio: Falls eine Zeile in der Zusammensetzung so etwas wie @ratio: 1:10@ oder @ratio: 1:1.5-2.4@ enthält, wird das Feld "more_info" entsprechend gesetzt.
|
17
|
+
|
18
|
+
* Namen mit Zahlen: Es wurde beschlossen, dass @Glyceroli Monostearas 33-45@ ein Name ist und nicht Glyceroli Monostearas von 33 bis 45 von irgendwas.
|
19
|
+
|
20
|
+
h2. Gebrauchte Abkürzungen und Schlüsselworte
|
21
|
+
|
22
|
+
* q.s.
|
23
|
+
* pro praeparatione
|
24
|
+
* excipiens
|
25
|
+
* pro compresso obducto
|
26
|
+
* pro compresso
|
27
|
+
* epro praeparatione
|
28
|
+
* ad pulverem
|
29
|
+
* pro charta
|
30
|
+
* ad globulos
|
31
|
+
* aqua ad iniectabilia q.s. ad solutionem
|
32
|
+
* solvens (i.v.): aqua ad iniectabilia
|
33
|
+
* ad solutionem
|
34
|
+
* q.s. ad
|
35
|
+
* aqua q.s. ad
|
36
|
+
* saccharum ad
|
37
|
+
* aether q.s.
|
38
|
+
* aqua ad iniectabilia
|
39
|
+
* q.s. pro praeparatione
|
40
|
+
* ana partes
|
41
|
+
* et oder @,@ trennen Substanzen
|
42
|
+
* ut: vorgehende Substanz wird via nachstehend aufgeführte(s) Salz(e) aufgenommen
|
43
|
+
|
44
|
+
|
45
|
+
h2. Vorschlag für Bereinigung
|
46
|
+
|
47
|
+
* SwissmedicErrorHandler
|
48
|
+
* Corresp: als Label für Bestandteile
|
49
|
+
* corresp. für zugehörende Substanz
|
50
|
+
* @<@ durch @max.@ ersetzen (1 Mal)
|
data/lib/oddb2xml/builder.rb
CHANGED
@@ -647,6 +647,10 @@ module Oddb2xml
|
|
647
647
|
next unless row and row.cells[0] and row.cells[0].value and row.cells[0].value.to_i > 0
|
648
648
|
iksnr = "%05i" % row.cells[0].value.to_i
|
649
649
|
seqnr = "%02d" % row.cells[1].value.to_i
|
650
|
+
if row_nr % 250 == 0
|
651
|
+
puts "#{Time.now}: At row #{row_nr} iksnr #{iksnr}";
|
652
|
+
$stdout.sync
|
653
|
+
end
|
650
654
|
no8 = sprintf('%05d',row.cells[0].value.to_i) + sprintf('%03d',row.cells[10].value.to_i)
|
651
655
|
name = row.cells[2].value
|
652
656
|
atc_code = row.cells[5] ? row.cells[5].value : nil
|
@@ -680,12 +684,13 @@ module Oddb2xml
|
|
680
684
|
xml.COMPOSITIONS {
|
681
685
|
info.compositions.each { |composition|
|
682
686
|
xml.COMPOSITION {
|
683
|
-
|
687
|
+
xml.CORRESP composition.corresp if composition.corresp
|
684
688
|
xml.LABEL composition.label if composition.label
|
685
689
|
xml.LABEL_DESCRIPTION composition.label_description if composition.label_description
|
686
690
|
xml.SUBSTANCES {
|
687
691
|
composition.substances.each { |substance|
|
688
692
|
xml.SUBSTANCE {
|
693
|
+
xml.MORE_INFO substance.more_info.gsub('&gt;','>').gsub('&amp;', '&') if substance.more_info
|
689
694
|
xml.SUBSTANCE_NAME substance.name
|
690
695
|
xml.IS_ACTIVE_AGENT substance.is_active_agent
|
691
696
|
if substance.unit
|
@@ -693,9 +698,9 @@ module Oddb2xml
|
|
693
698
|
xml.UNIT substance.unit
|
694
699
|
end
|
695
700
|
if substance.chemical_substance
|
696
|
-
xml.CHEMICAL_SUBSTANCE substance.chemical_substance
|
697
|
-
xml.CHEMICAL_QTY
|
698
|
-
xml.CHEMICAL_UNIT
|
701
|
+
xml.CHEMICAL_SUBSTANCE substance.chemical_substance.name
|
702
|
+
xml.CHEMICAL_QTY substance.chemical_substance.qty
|
703
|
+
xml.CHEMICAL_UNIT substance.chemical_substance.unit
|
699
704
|
end
|
700
705
|
}
|
701
706
|
}
|
data/lib/oddb2xml/calc.rb
CHANGED
data/lib/oddb2xml/cli.rb
CHANGED
@@ -0,0 +1,368 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
# This file is shared since oddb2xml 2.0.0 (lib/oddb2xml/parse_compositions.rb)
|
4
|
+
# with oddb.org src/plugin/parse_compositions.rb
|
5
|
+
#
|
6
|
+
# It allows an easy parsing of the column P Zusammensetzung of the swissmedic packages.xlsx file
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'parslet'
|
10
|
+
require 'parslet/convenience'
|
11
|
+
include Parslet
|
12
|
+
|
13
|
+
class CompositionParser < Parslet::Parser
|
14
|
+
|
15
|
+
# Single character rules
|
16
|
+
rule(:lparen) { str('(') }
|
17
|
+
rule(:rparen) { str(')') }
|
18
|
+
rule(:comma) { str(',') }
|
19
|
+
|
20
|
+
rule(:space) { match('\s').repeat(1) }
|
21
|
+
rule(:space?) { space.maybe }
|
22
|
+
|
23
|
+
# Things
|
24
|
+
rule(:digit) { match('[0-9]') }
|
25
|
+
rule(:digits) { digit.repeat(1) }
|
26
|
+
rule(:number) {
|
27
|
+
(
|
28
|
+
str('-').maybe >> (
|
29
|
+
str('0') | (match('[1-9]') >> match('[0-9\']').repeat)
|
30
|
+
) >> (
|
31
|
+
(str('*') >> digit.repeat(1)).maybe >>
|
32
|
+
(match(['.,^']) >> digit.repeat(1)).repeat(1)
|
33
|
+
).maybe >> (
|
34
|
+
match('[eE]') >> (str('+') | str('-')).maybe >> digit.repeat(1)
|
35
|
+
).maybe
|
36
|
+
)
|
37
|
+
}
|
38
|
+
rule(:radio_isotop) { match['a-zA-Z'].repeat(1) >> lparen >> digits >> str('-') >> match['a-zA-Z'].repeat(1-3) >> rparen >>
|
39
|
+
((space? >> match['a-zA-Z']).repeat(1)).repeat(0)
|
40
|
+
} # e.g. Xenonum (133-Xe) or yttrii(90-Y) chloridum zum Kalibrierungszeitpunkt
|
41
|
+
rule(:ratio_value) { match['0-9:\-\.'].repeat(1) >> space?} # eg. ratio: 1:1, ratio: 1:1.5-2.4., ratio: 1:0.68-0.95
|
42
|
+
|
43
|
+
# handle stuff like acidum 9,11-linolicum or 2,2'-methylen-bis(6-tert.-butyl-4-methyl-phenolum) specially. it must contain at least one a-z
|
44
|
+
rule(:umlaut) { match(['éàèèçïöäüâ']) }
|
45
|
+
rule(:identifier_D12) { match['a-zA-Z'] >> match['0-9'].repeat(1) }
|
46
|
+
rule(:identifier) { str('A + B') | str('ethanol.') | str('poloxamerum 238') | str('TM:') | str('&') | # TODO: why do we have to hard code these identifiers?
|
47
|
+
str('F.E.I.B.A.') | str('LA 25% TM') | str('50/50') | str('polysorbatum ') >> digit >> digit | str('q.s.') |
|
48
|
+
digit >> digit.maybe >> space >> str('per centum ') >> str('q.s.').maybe| str('1g/9.6 cm²') |
|
49
|
+
str('9 g/L 5.4 ml') |
|
50
|
+
str('spag.') | str('spp.') | str('ssp.') | str('deklar.') | # TODO: Sind diese Abkürzung wirklich Teil eines Substanznamens?
|
51
|
+
str('ca.') | str('var.') | str('spec.') |
|
52
|
+
identifier_D12 | identifier_without_comma | identifier_with_comma
|
53
|
+
}
|
54
|
+
|
55
|
+
rule(:identifier_with_comma) {
|
56
|
+
match['0-9,\-'].repeat(0) >> (match['a-zA-Z']|umlaut) >> (match(['_,']).maybe >> (match['0-9a-zA-Z\-\'\/'] | umlaut)).repeat(0)
|
57
|
+
}
|
58
|
+
|
59
|
+
rule(:identifier_without_comma) {
|
60
|
+
match['0-9\',\-'].repeat(0) >> (match['a-zA-Z']|umlaut) >> (match(['_']).maybe >> (match['0-9a-zA-Z\-\'\/'] | umlaut)).repeat(0) >>
|
61
|
+
lparen >> (rparen.absent? >> any).repeat(1) >> rparen
|
62
|
+
}
|
63
|
+
rule(:one_word) { match['a-zA-Z'] >> match['0-9'].repeat(1) | match['a-zA-Z'].repeat(1) }
|
64
|
+
rule(:in_parent) { lparen >> one_word.repeat(1) >> rparen }
|
65
|
+
rule(:words_nested) { one_word.repeat(1) >> in_parent.maybe >> space? >> one_word.repeat(0) }
|
66
|
+
# dose
|
67
|
+
# 150 U.I. hFSH et 150 U.I. hLH
|
68
|
+
rule(:dose_unit) { (str('cm²') |
|
69
|
+
str('g/dm²') |
|
70
|
+
str('g/l') |
|
71
|
+
str('g/L') |
|
72
|
+
str('% V/V') |
|
73
|
+
str('µg/24 h') |
|
74
|
+
str('µg/g') |
|
75
|
+
str('µg') |
|
76
|
+
str('ng') |
|
77
|
+
str('guttae') |
|
78
|
+
str('mg/g') |
|
79
|
+
str('mg/ml') |
|
80
|
+
str('MBq/ml') |
|
81
|
+
str('MBq') |
|
82
|
+
str('CFU') |
|
83
|
+
str('mg') |
|
84
|
+
str('Mg') |
|
85
|
+
str('kJ') |
|
86
|
+
str('G') |
|
87
|
+
str('g') |
|
88
|
+
str('l') |
|
89
|
+
str('µl') |
|
90
|
+
str('U. Ph. Eur.') |
|
91
|
+
str('ml') |
|
92
|
+
str('µmol') |
|
93
|
+
str('mmol/l') |
|
94
|
+
str('mmol') |
|
95
|
+
str('Mio CFU') |
|
96
|
+
str('Mio U.I.') |
|
97
|
+
str('Mio U.') |
|
98
|
+
str('Mio. U.I.') |
|
99
|
+
str('Mio. U.') |
|
100
|
+
str('Mia. U.I.') |
|
101
|
+
str('Mia. U.') |
|
102
|
+
str('U. Botox,') | # TODO: Should be U. Botox
|
103
|
+
str('U.I. hFSH') |
|
104
|
+
str('U.I. hCG') |
|
105
|
+
str('U.I. hLH') |
|
106
|
+
str('U.I.') |
|
107
|
+
str('U./ml') |
|
108
|
+
str('U.') |
|
109
|
+
str('Mia.') |
|
110
|
+
str('Mrd.') |
|
111
|
+
str('% m/m') |
|
112
|
+
str('% m/m') |
|
113
|
+
str('%')
|
114
|
+
).as(:unit) }
|
115
|
+
rule(:qty_range) { (number >> space? >> (str('+/-') | str(' - ') | str(' -') | str('-') | str('±') ) >> space? >> number).as(:qty_range) }
|
116
|
+
rule(:qty_unit) { dose_qty >> (space >> dose_unit).maybe }
|
117
|
+
rule(:dose_qty) { number.as(:qty) }
|
118
|
+
rule(:min_max) { str('mind.') | (str('min.') | str('max.') | str('ca.') | str('<') ) >> space? } # TODO: swissmedic should replace mind. -> min.
|
119
|
+
# 75 U.I. hFSH et 75 U.I. hLH
|
120
|
+
rule(:dose_fsh) { qty_unit >> space >> str('et') >> space >> qty_unit.as(:dose_right) }
|
121
|
+
rule(:dose_per) { (digits >> str('/') >> digits).as(:qty)}
|
122
|
+
rule(:dose) { dose_fsh |
|
123
|
+
dose_per |
|
124
|
+
( min_max.maybe >>
|
125
|
+
( (qty_range >> (space >> dose_unit).maybe) | (qty_unit | dose_qty |dose_unit)) >> space? )
|
126
|
+
}
|
127
|
+
rule(:dose_with_unit) { min_max.maybe >>
|
128
|
+
dose_fsh |
|
129
|
+
( qty_range >> space >> dose_unit |
|
130
|
+
dose_qty >> space >> dose_unit
|
131
|
+
) >>
|
132
|
+
space?
|
133
|
+
}
|
134
|
+
rule(:operator) { match('[+]') >> space? }
|
135
|
+
|
136
|
+
# Grammar parts
|
137
|
+
rule(:useage) { (any >> str('berzug:')) | # match Überzug
|
138
|
+
str('antiox.:') |
|
139
|
+
str('arom.:') |
|
140
|
+
str('conserv.:') |
|
141
|
+
str('color.:')
|
142
|
+
}
|
143
|
+
rule(:lebensmittel_zusatz) { str('E').as(:lebensmittel_zusatz) >> space >>
|
144
|
+
(digits >> match['(a-z)'].repeat(0,3)).as(:digits) >>
|
145
|
+
(space >> dose.as(:dose_lebensmittel_zusatz)).maybe >> space?
|
146
|
+
|
147
|
+
} # Match Wirkstoffe like E 270
|
148
|
+
rule(:der) { (str('DER:') >> space >> digit >> match['0-9\.\-:'].repeat).as(:der) >> space?
|
149
|
+
} # DER: 1:4 or DER: 3.5:1 or DER: 6-8:1 or DER: 4.0-9.0:1'
|
150
|
+
rule(:forbidden_in_substance_name) {
|
151
|
+
useage |
|
152
|
+
min_max |
|
153
|
+
str('corresp. ca.,') |
|
154
|
+
str(', corresp.') |
|
155
|
+
str('corresp.') |
|
156
|
+
str('ratio:') |
|
157
|
+
str('Mio ') |
|
158
|
+
str('et ') |
|
159
|
+
str('ut ') |
|
160
|
+
str('Beutel: ') |
|
161
|
+
str('ut alia: ') |
|
162
|
+
str('pro dosi') |
|
163
|
+
str('pro capsula') |
|
164
|
+
str('pro vitroe') |
|
165
|
+
(digits.repeat(1) >> space >> str(':')) | # match 50 %
|
166
|
+
str('ad globulos') |
|
167
|
+
str('ana ') |
|
168
|
+
str('ana partes') |
|
169
|
+
str('partes') |
|
170
|
+
str('ad pulverem') |
|
171
|
+
str('ad suspensionem') |
|
172
|
+
str('q.s. ad ') |
|
173
|
+
str('q.s. pro ') |
|
174
|
+
str('ad solutionem') |
|
175
|
+
str('ad emulsionem') |
|
176
|
+
str('excipiens')
|
177
|
+
}
|
178
|
+
rule(:name_without_parenthesis) {
|
179
|
+
(
|
180
|
+
(str('(') | forbidden_in_substance_name).absent? >>
|
181
|
+
(radio_isotop | str('> 1000') | str('> 500') | identifier.repeat(1)) >>
|
182
|
+
space?
|
183
|
+
).repeat(1)
|
184
|
+
}
|
185
|
+
|
186
|
+
rule(:part_with_parenthesis) { lparen >> ( (lparen | rparen).absent? >> any).repeat(1) >>
|
187
|
+
(part_with_parenthesis | rparen >> str('-like:') | rparen ) >> space?
|
188
|
+
}
|
189
|
+
rule(:name_with_parenthesis) {
|
190
|
+
forbidden_in_substance_name.absent? >>
|
191
|
+
((comma | lparen).absent? >> any).repeat(0) >> part_with_parenthesis >>
|
192
|
+
(forbidden_in_substance_name.absent? >> (identifier.repeat(1) | part_with_parenthesis | rparen) >> space?).repeat(0)
|
193
|
+
}
|
194
|
+
rule(:substance_name) { (
|
195
|
+
der |
|
196
|
+
name_with_parenthesis |
|
197
|
+
name_without_parenthesis
|
198
|
+
) >>
|
199
|
+
str('pro dosi').maybe >> space?
|
200
|
+
}
|
201
|
+
rule(:simple_substance) { substance_name.as(:substance_name) >> space? >> dose.as(:dose).maybe}
|
202
|
+
rule(:simple_subtance_with_digits_in_name_and_dose) {
|
203
|
+
substance_lead.maybe >> space? >>
|
204
|
+
(name_without_parenthesis >> space? >> ((digits.repeat(1) >> (str(' %') | str('%')) | digits.repeat(1)))).as(:substance_name) >>
|
205
|
+
space >> dose_with_unit.as(:dose)
|
206
|
+
}
|
207
|
+
|
208
|
+
|
209
|
+
rule(:pro_dose) { str('pro') >> space >> dose.as(:dose_corresp) }
|
210
|
+
|
211
|
+
# TODO: what does ut alia: impl?
|
212
|
+
rule(:substance_ut) {
|
213
|
+
(substance_lead.maybe >> simple_substance).as(:substance_ut) >>
|
214
|
+
(space? >> (str('pro dosi ut ') | str('ut ') ) >>
|
215
|
+
space? >> str('alia:').absent? >>
|
216
|
+
(excipiens |
|
217
|
+
substance_name >> space? >> str('corresp.') >> space? >> substance_lead.maybe >> space? >> simple_substance |
|
218
|
+
simple_substance
|
219
|
+
).as(:for_ut)
|
220
|
+
).repeat(1) >>
|
221
|
+
space? # >> str('alia:').maybe >> space?
|
222
|
+
}
|
223
|
+
|
224
|
+
rule(:substance_more_info) { # e.g. "acari allergeni extractum 5000 U.:
|
225
|
+
(str('ratio:').absent? >> (identifier|digits) >> space?).repeat(1).as(:more_info) >> space? >> (str('U.:') | str(':')| str('.:')) >> space?
|
226
|
+
}
|
227
|
+
|
228
|
+
rule(:dose_pro) { (
|
229
|
+
str('excipiens ad solutionem pro ') |
|
230
|
+
str('aqua q.s. ad gelatume pro ') |
|
231
|
+
str('aqua q.s. ad solutionem pro ') |
|
232
|
+
str('aqua q.s. ad suspensionem pro ') |
|
233
|
+
str('q.s. ad pulverem pro ') |
|
234
|
+
str('doses pro vase ') |
|
235
|
+
str('pro vase ') |
|
236
|
+
str('excipiens ad emulsionem pro ') |
|
237
|
+
str('excipiens ad pulverem pro ') |
|
238
|
+
str('aqua ad iniectabilia q.s. ad solutionem pro ')
|
239
|
+
) >> dose.as(:dose_pro) >> space? >> ratio.as(:ratio).maybe
|
240
|
+
}
|
241
|
+
|
242
|
+
rule(:excipiens) { (dose_pro |
|
243
|
+
str('excipiens pro compresso obducto') |
|
244
|
+
str('excipiens pro compresso') |
|
245
|
+
str('excipiens pro praeparatione') |
|
246
|
+
str('excipiens') |
|
247
|
+
str('ad pulverem') |
|
248
|
+
str('pro charta') |
|
249
|
+
str('ad globulos') |
|
250
|
+
str('aqua ad iniectabilia q.s. ad solutionem') |
|
251
|
+
str('solvens (i.v.): aqua ad iniectabilia') |
|
252
|
+
str('ad solutionem') |
|
253
|
+
str('q.s. ad') |
|
254
|
+
str('aqua q.s. ad') |
|
255
|
+
str('saccharum ad') |
|
256
|
+
str('aether q.s.') |
|
257
|
+
str('pro vitro') |
|
258
|
+
str('aqua ad iniectabilia') |
|
259
|
+
str('pro praeparatione') |
|
260
|
+
str('q.s. pro praeparatione') |
|
261
|
+
str('ana partes')
|
262
|
+
) >> space? >>
|
263
|
+
( any.repeat(0) )
|
264
|
+
}
|
265
|
+
|
266
|
+
rule(:substance_lead) { useage.as(:more_info) >> space? |
|
267
|
+
str('Beutel:').as(:more_info) >> space? |
|
268
|
+
str('residui:').as(:more_info) >> space? |
|
269
|
+
str('mineralia').as(:mineralia) >> str(':') >> space? |
|
270
|
+
str('Solvens:').as(:solvens) >> space? |
|
271
|
+
substance_more_info
|
272
|
+
}
|
273
|
+
rule(:corresp_substance_label) {
|
274
|
+
str(', corresp. ca.,') |
|
275
|
+
str('corresp. ca.,') |
|
276
|
+
str('corresp.') |
|
277
|
+
str('corresp., ') |
|
278
|
+
str(', corresp.')
|
279
|
+
}
|
280
|
+
|
281
|
+
rule(:corresp_substance) {
|
282
|
+
(corresp_substance_label) >> space? >>
|
283
|
+
(
|
284
|
+
simple_substance.as(:substance_corresp) |
|
285
|
+
dose.as(:dose_corresp_2)
|
286
|
+
)
|
287
|
+
}
|
288
|
+
|
289
|
+
rule(:ratio) { str('ratio:') >> space >> ratio_value }
|
290
|
+
|
291
|
+
rule(:solvens) { (str('Solvens:') | str('Solvens (i.m.):'))>> space >> (any.repeat).as(:solvens) >> space? >>
|
292
|
+
(substance.as(:substance) >> str('/L').maybe).maybe >>
|
293
|
+
any.maybe
|
294
|
+
}
|
295
|
+
rule(:substance) {
|
296
|
+
simple_subtance_with_digits_in_name_and_dose |
|
297
|
+
useage.as(:more_info) >> space? >> excipiens |
|
298
|
+
ratio.as(:ratio) |
|
299
|
+
solvens |
|
300
|
+
der >> corresp_substance.maybe |
|
301
|
+
(str('potenziert mit:') >> space).maybe >> excipiens.as(:excipiens) |
|
302
|
+
substance_ut |
|
303
|
+
substance_lead.maybe >> space? >> lebensmittel_zusatz |
|
304
|
+
substance_lead.maybe >> space? >> simple_substance >> corresp_substance.maybe >> space? >> corresp_substance.maybe >> space? >> dose_pro.maybe >> str('pro dosi').maybe
|
305
|
+
}
|
306
|
+
rule(:histamin) { str('U = Histamin Equivalent Prick').as(:histamin) }
|
307
|
+
rule(:praeparatio){ ((one_word >> space?).repeat(1).as(:description) >> str(':') >> space?).maybe >>
|
308
|
+
(name_with_parenthesis | name_without_parenthesis).repeat(1).as(:substance_name) >>
|
309
|
+
number.as(:qty) >> space >> str('U.:') >> space? >>
|
310
|
+
((identifier >> space?).repeat(1).as(:more_info) >> space?).maybe
|
311
|
+
}
|
312
|
+
rule(:substance_separator) { (str(', et ') | comma | str('et ') | str('ut alia: ')) >> space? }
|
313
|
+
rule(:one_substance) { (praeparatio | histamin | substance).as(:substance) >> space? >> ratio.as(:ratio).maybe }
|
314
|
+
# rule(:one_substance) { (substance_ut).as(:substance) } # >> str('.').maybe }
|
315
|
+
rule(:all_substances) { (one_substance >> substance_separator.maybe).repeat(1) }
|
316
|
+
rule(:composition) { all_substances }
|
317
|
+
rule(:long_labels) {
|
318
|
+
str('Praeparatio sicca cum solvens: praeparatio sicca:') |
|
319
|
+
str('Praeparatio cryodesiccata') >> (str(':').absent? >> any).repeat(0) >> str(':') |
|
320
|
+
str('Tela cum praeparatione (Panel ') >> digit >> str('):')
|
321
|
+
}
|
322
|
+
rule(:label_id) {
|
323
|
+
(
|
324
|
+
str('V') |
|
325
|
+
str('IV') |
|
326
|
+
str('III') |
|
327
|
+
str('II') |
|
328
|
+
str('I') |
|
329
|
+
str('A') |
|
330
|
+
str('B') |
|
331
|
+
str('C') |
|
332
|
+
str('D') |
|
333
|
+
str('E')
|
334
|
+
)
|
335
|
+
}
|
336
|
+
rule(:label_separator) { (str('):') | str(')')) }
|
337
|
+
rule(:label) { label_id.as(:label) >> space? >>
|
338
|
+
label_separator >> str(',').absent? >>
|
339
|
+
(space? >> (match(/[^:]/).repeat(0)).as(:label_description) >> str(':') >> space).maybe
|
340
|
+
}
|
341
|
+
rule(:leading_label) { label_id >> label_separator >> (str(' et ') | str(', ') | str(' pro usu: ') | space) >>
|
342
|
+
label_id >> label_separator >> any.repeat(1) |
|
343
|
+
long_labels.as(:label) |
|
344
|
+
label
|
345
|
+
}
|
346
|
+
rule(:corresp_label) {
|
347
|
+
str('doses ') |
|
348
|
+
str('Pulver: ') |
|
349
|
+
str('Diluens: ') |
|
350
|
+
str('Solvens (i.v.): ') |
|
351
|
+
str('Solvens (i.m.): ') |
|
352
|
+
str('Solvens: ') |
|
353
|
+
str('Solutio reconstituta:') |
|
354
|
+
str('Corresp., ') |
|
355
|
+
str('Corresp. ') |
|
356
|
+
str('corresp. ')
|
357
|
+
}
|
358
|
+
rule(:corresp_line) { corresp_label >> any.repeat(1).as(:corresp) |
|
359
|
+
((label_id >> label_separator >> space? >> str('et ').maybe).repeat(1) >> any.repeat(1)).as(:corresp)
|
360
|
+
}
|
361
|
+
|
362
|
+
rule(:expression_comp) {
|
363
|
+
leading_label.maybe >> space? >> composition.as(:composition) >> space? >> str('.').maybe >> space? |
|
364
|
+
corresp_line
|
365
|
+
}
|
366
|
+
root :expression_comp
|
367
|
+
end
|
368
|
+
|
data/lib/oddb2xml/extractor.rb
CHANGED
@@ -456,16 +456,25 @@ module Oddb2xml
|
|
456
456
|
# see http://dev.ywesee.com/Bbmb/TransferDat
|
457
457
|
def initialize(dat, extended = false)
|
458
458
|
@@extended = extended
|
459
|
-
@@error_file ||= File.open(File.join(WorkDir, "duplicate_ean13_from_zur_rose.txt"), '
|
459
|
+
@@error_file ||= File.open(File.join(WorkDir, "duplicate_ean13_from_zur_rose.txt"), 'wb+:ISO-8859-14')
|
460
460
|
@@items_without_ean13s ||= 0
|
461
461
|
@@duplicated_ean13s ||= 0
|
462
462
|
@@zur_rose_items ||= 0
|
463
|
-
|
463
|
+
if dat
|
464
|
+
if File.exists?(dat)
|
465
|
+
@io = File.open(dat, 'rb:ISO-8859-14')
|
466
|
+
else
|
467
|
+
@io = StringIO.new(dat)
|
468
|
+
end
|
469
|
+
@io
|
470
|
+
else
|
471
|
+
nil
|
472
|
+
end
|
464
473
|
end
|
465
474
|
def to_hash
|
466
475
|
data = {}
|
467
476
|
while line = @io.gets
|
468
|
-
line = line.chomp
|
477
|
+
line = line.encode('utf-8').gsub("\u0089", "‰").gsub("\u0092", '’').gsub("\u0096", '-').chomp
|
469
478
|
next if line =~ /(ad us\.* vet)|(\(vet\))/i
|
470
479
|
if @@extended
|
471
480
|
next unless line =~ /(\d{13})(\d{1})$/
|
@@ -490,7 +499,7 @@ module Oddb2xml
|
|
490
499
|
:line => line.chomp,
|
491
500
|
:ean => ean13,
|
492
501
|
:vat => line[96],
|
493
|
-
:description => line[10..59]
|
502
|
+
:description => line[10..59].sub(/\s+$/, ''),
|
494
503
|
:additional_desc => '',
|
495
504
|
:pharmacode => pharma_code,
|
496
505
|
:price => sprintf("%.2f", line[60,6].gsub(/(\d{2})$/, '.\1').to_f),
|