brand2csv 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,7 @@
1
+ === 0.1.4 22.05.2013
2
+
3
+ * Better handling of address lines like '90 route de Frontenex', 'Via San Salvatore, 2'
4
+
1
5
  === 0.1.3 22.05.2013
2
6
 
3
7
  * Updated Manifest.txt to include bin/brand2csv
@@ -42,6 +42,8 @@ module Brand2csv
42
42
  "tm_lbl_app_date", # Hinterlegungsdatum
43
43
  ]
44
44
 
45
+ MaxZeilen = 5
46
+
45
47
  attr_accessor :marke
46
48
 
47
49
  def initialize(timespan)
@@ -188,27 +190,44 @@ module Brand2csv
188
190
  @lastResponse = response
189
191
  end
190
192
 
191
- def parseAddress(nummer, inhaber)
192
- zeile_1, zeile_2, zeile_3, zeile_4, zeile_5 = inhaber.split(LineSplit)
193
+ def parseAddress(nummer, zeilen)
193
194
  ort = nil
194
195
  plz = nil
195
- if m = AddressRegexp.match(zeile_2)
196
- zeile_2 = nil
197
- plz = m[1]; ort = m[2]
198
- elsif m = AddressRegexp.match(zeile_3)
199
- zeile_3 = nil
200
- plz = m[1]; ort = m[2]
201
- elsif m = AddressRegexp.match(zeile_4)
202
- zeile_4 = nil
203
- plz = m[1]; ort = m[2]
204
- elsif m = AddressRegexp.match(zeile_5)
205
- zeile_5 = nil
206
- plz = m[1]; ort = m[2]
207
- else
208
- puts "Achtung! Konnte Marke #{nummer} mit Inhaber #{inhaber} nicht parsen" if $VERBOSE
196
+
197
+ # Search for plz/address
198
+ 1.upto(zeilen.length-1).each {
199
+ |cnt|
200
+ if m = AddressRegexp.match(zeilen[cnt])
201
+ zeilen[cnt+1] = nil
202
+ plz = m[1]; ort = m[2]
203
+ cnt.upto(MaxZeilen-1).each{ |cnt2| zeilen[cnt2] = nil }
204
+ break
205
+ end
206
+ }
207
+ unless plz
208
+ puts "Achtung! Konnte Marke #{nummer} mit Inhaber #{zeilen.inpsect} nicht parsen" if $VERBOSE
209
209
  return nil, nil, nil, nil, nil, nil, nil, nil
210
210
  end
211
- return zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort
211
+ # search for lines with only digits
212
+ found = false
213
+ 1.upto(zeilen.length-1).each {
214
+ |cnt|
215
+ break if zeilen[cnt] == nil
216
+ if /^\d*$/.match(zeilen[cnt])
217
+ found = true
218
+ if zeilen[cnt+1] == nil
219
+ found = 'before'
220
+ zeilen[cnt-1] += LineSplit + zeilen[cnt]
221
+ zeilen.delete_at(cnt)
222
+ else
223
+ found = 'after'
224
+ zeilen[cnt] += LineSplit + zeilen[cnt+1]
225
+ zeilen.delete_at(cnt+1)
226
+ end
227
+ end
228
+ }
229
+ puts "found #{found}: #{zeilen.inspect}" if found and $VERBOSE
230
+ return zeilen[0], zeilen[1], zeilen[2], zeilen[3], zeilen[4], plz, ort
212
231
  end
213
232
 
214
233
  def fetchDetails(nummer) # takes a long time!
@@ -223,7 +242,7 @@ module Brand2csv
223
242
  writeResponse("mechanize/detail_#{nummer}.html", content)
224
243
  doc = Nokogiri::Slop(content)
225
244
  end
226
- puts "Bitte um Geduld. Hole Adressdetails für Marke #{nummer}. (#{@counterDetails} von #{@errors.size})"
245
+ puts "Bitte um Geduld. Holte Adressdetails für Marke #{nummer}. (#{@counterDetails} von #{@errors.size})"
227
246
  path_name = "//html/body/form/div/div/fieldset/div/table/tbody/tr/td"
228
247
  counter = 0
229
248
  doc.xpath(path_name).each{
@@ -232,15 +251,15 @@ module Brand2csv
232
251
  counter += 1
233
252
  next unless /^inhaber/i.match(td.text)
234
253
  zeilen = []
235
- doc.xpath(path_name)[counter].children.each{ |child| zeilen << child.text.gsub(LineSplit,'. ') unless child.text.length == 0 } # avoid adding <br>
254
+ doc.xpath(path_name)[counter].children.each{ |child| zeilen << child.text unless child.text.length == 0 } # avoid adding <br>
236
255
  if info = @errors[nummer]
237
- info.inhaber = zeilen.join(LineSplit)
238
- info.zeile_1, info.zeile_2, info.zeile_3, info.zeile_4, zeile_5, info.plz, info.ort = parseAddress(nummer, info.inhaber)
256
+ info.inhaber = zeilen.join(" ")
257
+ info.zeile_1, info.zeile_2, info.zeile_3, info.zeile_4, zeile_5, info.plz, info.ort = parseAddress(nummer, zeilen)
239
258
  @results << info
240
259
  else
241
260
  bezeichnung = doc.xpath(path_name)[15]
242
- inhaber = zeilen.join(LineSplit)
243
- zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort = parseAddress(nummer, inhaber)
261
+ inhaber = zeilen.join(" ")
262
+ zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort = parseAddress(nummer, zeilen)
244
263
  hinterlegungsdatum = doc.xpath(path_name)[7]
245
264
  marke = Marke.new(bezeichnung, nummer, inhaber, DefaultCountry, hinterlegungsdatum, zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort )
246
265
  @results << marke
@@ -256,7 +275,7 @@ module Brand2csv
256
275
  end
257
276
  nrFailures = 0
258
277
  counter += 1
259
- puts "fetchresult. Counter #{counter} already #{@results.size} Datensätze für die Zeitspanne '#{@timespan}'"
278
+ puts "fetchresult. Counter #{counter} already #{@results.size} Datensätze für die Zeitspanne '#{@timespan}'" if $VERBOSE
260
279
  path_name = "//html/body/form/div/div/fieldset/table/tbody/tr/td/table/tr/td"
261
280
  hasNext = false
262
281
  doc.xpath(path_name).each{
@@ -277,7 +296,7 @@ module Brand2csv
277
296
  if bezeichnung.length == 0
278
297
  bezeichnung = elem.children[1].children[0].children[0].children[0].attribute('src').to_s
279
298
  end
280
- zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort = parseAddress(nummer, inhaber)
299
+ zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort = parseAddress(nummer, inhaber.split(LineSplit))
281
300
  if zeile_1
282
301
  @results << Marke.new(bezeichnung, elem.elements[2].text, elem.elements[3].text, land, elem.elements[5].text,
283
302
  zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort )
@@ -1,3 +1,3 @@
1
1
  module Brand2csv
2
- VERSION = "0.1.3"
2
+ VERSION = "0.1.4"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: brand2csv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: