brand2csv 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +4 -0
- data/lib/brand2csv.rb +44 -25
- data/lib/brand2csv/version.rb +1 -1
- metadata +1 -1
data/History.txt
CHANGED
data/lib/brand2csv.rb
CHANGED
@@ -42,6 +42,8 @@ module Brand2csv
|
|
42
42
|
"tm_lbl_app_date", # Hinterlegungsdatum
|
43
43
|
]
|
44
44
|
|
45
|
+
MaxZeilen = 5
|
46
|
+
|
45
47
|
attr_accessor :marke
|
46
48
|
|
47
49
|
def initialize(timespan)
|
@@ -188,27 +190,44 @@ module Brand2csv
|
|
188
190
|
@lastResponse = response
|
189
191
|
end
|
190
192
|
|
191
|
-
def parseAddress(nummer, inhaber)
|
192
|
-
zeile_1, zeile_2, zeile_3, zeile_4, zeile_5 = inhaber.split(LineSplit)
|
193
|
+
def parseAddress(nummer, zeilen)
|
193
194
|
ort = nil
|
194
195
|
plz = nil
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
puts "Achtung! Konnte Marke #{nummer} mit Inhaber #{inhaber} nicht parsen" if $VERBOSE
|
196
|
+
|
197
|
+
# Search for plz/address
|
198
|
+
1.upto(zeilen.length-1).each {
|
199
|
+
|cnt|
|
200
|
+
if m = AddressRegexp.match(zeilen[cnt])
|
201
|
+
zeilen[cnt+1] = nil
|
202
|
+
plz = m[1]; ort = m[2]
|
203
|
+
cnt.upto(MaxZeilen-1).each{ |cnt2| zeilen[cnt2] = nil }
|
204
|
+
break
|
205
|
+
end
|
206
|
+
}
|
207
|
+
unless plz
|
208
|
+
puts "Achtung! Konnte Marke #{nummer} mit Inhaber #{zeilen.inspect} nicht parsen" if $VERBOSE
|
209
209
|
return nil, nil, nil, nil, nil, nil, nil, nil
|
210
210
|
end
|
211
|
-
|
211
|
+
# search for lines with only digits
|
212
|
+
found = false
|
213
|
+
1.upto(zeilen.length-1).each {
|
214
|
+
|cnt|
|
215
|
+
break if zeilen[cnt] == nil
|
216
|
+
if /^\d*$/.match(zeilen[cnt])
|
217
|
+
found = true
|
218
|
+
if zeilen[cnt+1] == nil
|
219
|
+
found = 'before'
|
220
|
+
zeilen[cnt-1] += LineSplit + zeilen[cnt]
|
221
|
+
zeilen.delete_at(cnt)
|
222
|
+
else
|
223
|
+
found = 'after'
|
224
|
+
zeilen[cnt] += LineSplit + zeilen[cnt+1]
|
225
|
+
zeilen.delete_at(cnt+1)
|
226
|
+
end
|
227
|
+
end
|
228
|
+
}
|
229
|
+
puts "found #{found}: #{zeilen.inspect}" if found and $VERBOSE
|
230
|
+
return zeilen[0], zeilen[1], zeilen[2], zeilen[3], zeilen[4], plz, ort
|
212
231
|
end
|
213
232
|
|
214
233
|
def fetchDetails(nummer) # takes a long time!
|
@@ -223,7 +242,7 @@ module Brand2csv
|
|
223
242
|
writeResponse("mechanize/detail_#{nummer}.html", content)
|
224
243
|
doc = Nokogiri::Slop(content)
|
225
244
|
end
|
226
|
-
puts "Bitte um Geduld.
|
245
|
+
puts "Bitte um Geduld. Holte Adressdetails für Marke #{nummer}. (#{@counterDetails} von #{@errors.size})"
|
227
246
|
path_name = "//html/body/form/div/div/fieldset/div/table/tbody/tr/td"
|
228
247
|
counter = 0
|
229
248
|
doc.xpath(path_name).each{
|
@@ -232,15 +251,15 @@ module Brand2csv
|
|
232
251
|
counter += 1
|
233
252
|
next unless /^inhaber/i.match(td.text)
|
234
253
|
zeilen = []
|
235
|
-
doc.xpath(path_name)[counter].children.each{ |child| zeilen << child.text }
|
254
|
+
doc.xpath(path_name)[counter].children.each{ |child| zeilen << child.text unless child.text.length == 0 } # avoid adding <br>
|
236
255
|
if info = @errors[nummer]
|
237
|
-
info.inhaber = zeilen.join(LineSplit)
|
238
|
-
info.zeile_1, info.zeile_2, info.zeile_3, info.zeile_4, zeile_5, info.plz, info.ort = parseAddress(nummer, info.inhaber)
|
256
|
+
info.inhaber = zeilen.join(" ")
|
257
|
+
info.zeile_1, info.zeile_2, info.zeile_3, info.zeile_4, zeile_5, info.plz, info.ort = parseAddress(nummer, zeilen)
|
239
258
|
@results << info
|
240
259
|
else
|
241
260
|
bezeichnung = doc.xpath(path_name)[15]
|
242
|
-
inhaber = zeilen.join(LineSplit)
|
243
|
-
zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort = parseAddress(nummer, inhaber)
|
261
|
+
inhaber = zeilen.join(" ")
|
262
|
+
zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort = parseAddress(nummer, zeilen)
|
244
263
|
hinterlegungsdatum = doc.xpath(path_name)[7]
|
245
264
|
marke = Marke.new(bezeichnung, nummer, inhaber, DefaultCountry, hinterlegungsdatum, zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort )
|
246
265
|
@results << marke
|
@@ -256,7 +275,7 @@ module Brand2csv
|
|
256
275
|
end
|
257
276
|
nrFailures = 0
|
258
277
|
counter += 1
|
259
|
-
puts "fetchresult. Counter #{counter} already #{@results.size} Datensätze für die Zeitspanne '#{@timespan}'"
|
278
|
+
puts "fetchresult. Counter #{counter} already #{@results.size} Datensätze für die Zeitspanne '#{@timespan}'" if $VERBOSE
|
260
279
|
path_name = "//html/body/form/div/div/fieldset/table/tbody/tr/td/table/tr/td"
|
261
280
|
hasNext = false
|
262
281
|
doc.xpath(path_name).each{
|
@@ -277,7 +296,7 @@ module Brand2csv
|
|
277
296
|
if bezeichnung.length == 0
|
278
297
|
bezeichnung = elem.children[1].children[0].children[0].children[0].attribute('src').to_s
|
279
298
|
end
|
280
|
-
zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort = parseAddress(nummer, inhaber)
|
299
|
+
zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort = parseAddress(nummer, inhaber.split(LineSplit))
|
281
300
|
if zeile_1
|
282
301
|
@results << Marke.new(bezeichnung, elem.elements[2].text, elem.elements[3].text, land, elem.elements[5].text,
|
283
302
|
zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort )
|
data/lib/brand2csv/version.rb
CHANGED