brand2csv 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/lib/brand2csv.rb +44 -25
- data/lib/brand2csv/version.rb +1 -1
- metadata +1 -1
data/History.txt
CHANGED
data/lib/brand2csv.rb
CHANGED
@@ -42,6 +42,8 @@ module Brand2csv
|
|
42
42
|
"tm_lbl_app_date", # Hinterlegungsdatum
|
43
43
|
]
|
44
44
|
|
45
|
+
MaxZeilen = 5
|
46
|
+
|
45
47
|
attr_accessor :marke
|
46
48
|
|
47
49
|
def initialize(timespan)
|
@@ -188,27 +190,44 @@ module Brand2csv
|
|
188
190
|
@lastResponse = response
|
189
191
|
end
|
190
192
|
|
191
|
-
def parseAddress(nummer,
|
192
|
-
zeile_1, zeile_2, zeile_3, zeile_4, zeile_5 = inhaber.split(LineSplit)
|
193
|
+
def parseAddress(nummer, zeilen)
|
193
194
|
ort = nil
|
194
195
|
plz = nil
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
puts "Achtung! Konnte Marke #{nummer} mit Inhaber #{inhaber} nicht parsen" if $VERBOSE
|
196
|
+
|
197
|
+
# Search for plz/address
|
198
|
+
1.upto(zeilen.length-1).each {
|
199
|
+
|cnt|
|
200
|
+
if m = AddressRegexp.match(zeilen[cnt])
|
201
|
+
zeilen[cnt+1] = nil
|
202
|
+
plz = m[1]; ort = m[2]
|
203
|
+
cnt.upto(MaxZeilen-1).each{ |cnt2| zeilen[cnt2] = nil }
|
204
|
+
break
|
205
|
+
end
|
206
|
+
}
|
207
|
+
unless plz
|
208
|
+
puts "Achtung! Konnte Marke #{nummer} mit Inhaber #{zeilen.inpsect} nicht parsen" if $VERBOSE
|
209
209
|
return nil, nil, nil, nil, nil, nil, nil, nil
|
210
210
|
end
|
211
|
-
|
211
|
+
# search for lines with only digits
|
212
|
+
found = false
|
213
|
+
1.upto(zeilen.length-1).each {
|
214
|
+
|cnt|
|
215
|
+
break if zeilen[cnt] == nil
|
216
|
+
if /^\d*$/.match(zeilen[cnt])
|
217
|
+
found = true
|
218
|
+
if zeilen[cnt+1] == nil
|
219
|
+
found = 'before'
|
220
|
+
zeilen[cnt-1] += LineSplit + zeilen[cnt]
|
221
|
+
zeilen.delete_at(cnt)
|
222
|
+
else
|
223
|
+
found = 'after'
|
224
|
+
zeilen[cnt] += LineSplit + zeilen[cnt+1]
|
225
|
+
zeilen.delete_at(cnt+1)
|
226
|
+
end
|
227
|
+
end
|
228
|
+
}
|
229
|
+
puts "found #{found}: #{zeilen.inspect}" if found and $VERBOSE
|
230
|
+
return zeilen[0], zeilen[1], zeilen[2], zeilen[3], zeilen[4], plz, ort
|
212
231
|
end
|
213
232
|
|
214
233
|
def fetchDetails(nummer) # takes a long time!
|
@@ -223,7 +242,7 @@ module Brand2csv
|
|
223
242
|
writeResponse("mechanize/detail_#{nummer}.html", content)
|
224
243
|
doc = Nokogiri::Slop(content)
|
225
244
|
end
|
226
|
-
puts "Bitte um Geduld.
|
245
|
+
puts "Bitte um Geduld. Holte Adressdetails für Marke #{nummer}. (#{@counterDetails} von #{@errors.size})"
|
227
246
|
path_name = "//html/body/form/div/div/fieldset/div/table/tbody/tr/td"
|
228
247
|
counter = 0
|
229
248
|
doc.xpath(path_name).each{
|
@@ -232,15 +251,15 @@ module Brand2csv
|
|
232
251
|
counter += 1
|
233
252
|
next unless /^inhaber/i.match(td.text)
|
234
253
|
zeilen = []
|
235
|
-
doc.xpath(path_name)[counter].children.each{ |child| zeilen << child.text
|
254
|
+
doc.xpath(path_name)[counter].children.each{ |child| zeilen << child.text unless child.text.length == 0 } # avoid adding <br>
|
236
255
|
if info = @errors[nummer]
|
237
|
-
info.inhaber = zeilen.join(
|
238
|
-
info.zeile_1, info.zeile_2, info.zeile_3, info.zeile_4, zeile_5, info.plz, info.ort = parseAddress(nummer,
|
256
|
+
info.inhaber = zeilen.join(" ")
|
257
|
+
info.zeile_1, info.zeile_2, info.zeile_3, info.zeile_4, zeile_5, info.plz, info.ort = parseAddress(nummer, zeilen)
|
239
258
|
@results << info
|
240
259
|
else
|
241
260
|
bezeichnung = doc.xpath(path_name)[15]
|
242
|
-
inhaber = zeilen.join(
|
243
|
-
zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort = parseAddress(nummer,
|
261
|
+
inhaber = zeilen.join(" ")
|
262
|
+
zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort = parseAddress(nummer, zeilen)
|
244
263
|
hinterlegungsdatum = doc.xpath(path_name)[7]
|
245
264
|
marke = Marke.new(bezeichnung, nummer, inhaber, DefaultCountry, hinterlegungsdatum, zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort )
|
246
265
|
@results << marke
|
@@ -256,7 +275,7 @@ module Brand2csv
|
|
256
275
|
end
|
257
276
|
nrFailures = 0
|
258
277
|
counter += 1
|
259
|
-
puts "fetchresult. Counter #{counter} already #{@results.size} Datensätze für die Zeitspanne '#{@timespan}'"
|
278
|
+
puts "fetchresult. Counter #{counter} already #{@results.size} Datensätze für die Zeitspanne '#{@timespan}'" if $VERBOSE
|
260
279
|
path_name = "//html/body/form/div/div/fieldset/table/tbody/tr/td/table/tr/td"
|
261
280
|
hasNext = false
|
262
281
|
doc.xpath(path_name).each{
|
@@ -277,7 +296,7 @@ module Brand2csv
|
|
277
296
|
if bezeichnung.length == 0
|
278
297
|
bezeichnung = elem.children[1].children[0].children[0].children[0].attribute('src').to_s
|
279
298
|
end
|
280
|
-
zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort = parseAddress(nummer, inhaber)
|
299
|
+
zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort = parseAddress(nummer, inhaber.split(LineSplit))
|
281
300
|
if zeile_1
|
282
301
|
@results << Marke.new(bezeichnung, elem.elements[2].text, elem.elements[3].text, land, elem.elements[5].text,
|
283
302
|
zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort )
|
data/lib/brand2csv/version.rb
CHANGED