brand2csv 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gemtest ADDED
File without changes
data/.gitignore CHANGED
@@ -1,3 +1,4 @@
1
1
  *~
2
2
  *#
3
3
  ausgabe.csv
4
+ mechanize/
data/History.txt CHANGED
@@ -1,3 +1,10 @@
1
+ === 0.1.6 27.05.2013
2
+
3
+ * Added support for parsing results up to 10'000 hits
4
+ * Added rspec
5
+ * Added (undocumented) second parameter to limit the search according to the trademark name
6
+ * Added .travis.yml to enable running tests via http://about.travis-ci.org/docs/
7
+
1
8
  === 0.1.5 23.05.2013
2
9
 
3
10
  * Run under Ruby 1.8.7
data/bin/brand2csv CHANGED
@@ -37,13 +37,13 @@ rescue OptionParser::MissingArgument,
37
37
  end
38
38
 
39
39
 
40
- unless args.size == 1
40
+ unless args.size >= 1
41
41
  puts help
42
42
  exit 1
43
43
  end
44
44
 
45
45
  begin
46
- Brand2csv::run(args[0])
46
+ Brand2csv::run(args[0], args[1])
47
47
  rescue Interrupt
48
48
  puts "Unterbrochen. Breche mit Fehler ab"
49
49
  exit 1
data/lib/brand2csv.rb CHANGED
@@ -27,6 +27,11 @@ module Brand2csv
27
27
  ]
28
28
  Base_uri = 'https://www.swissreg.ch'
29
29
  Start_uri = "#{Base_uri}/srclient/faces/jsp/start.jsp"
30
+ Sr1 = "#{Base_uri}/srclient/faces/jsp/trademark/sr1.jsp"
31
+ Sr2 = "#{Base_uri}/srclient/faces/jsp/trademark/sr2.jsp"
32
+ Sr3 = "#{Base_uri}/srclient/faces/jsp/trademark/sr3.jsp"
33
+ Sr30 = "#{Base_uri}/srclient/faces/jsp/trademark/sr30.jsp"
34
+ Sr300 = "#{Base_uri}/srclient/faces/jsp/trademark/sr300.jsp"
30
35
  AddressRegexp = /^(\d\d\d\d)\W*(.*)/
31
36
  LineSplit = ', '
32
37
  DefaultCountry = 'Schweiz'
@@ -77,20 +82,21 @@ module Brand2csv
77
82
 
78
83
 
79
84
  MaxZeilen = 5
85
+ HitsPerPage = 250
86
+ LogDir = 'mechanize'
80
87
 
81
- attr_accessor :marke
88
+ attr_accessor :marke, :results, :timespan
82
89
 
83
- def initialize(timespan)
90
+ def initialize(timespan, marke = nil)
84
91
  @timespan = timespan
85
- @marke = nil
92
+ @marke = marke
86
93
  @number = nil
87
- @hitsPerPage = 100
88
94
 
89
95
  @agent = Mechanize.new { |agent|
90
96
  agent.user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0'
91
97
  agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
92
- FileUtils.makedirs 'mechanize' if $VERBOSE
93
- agent.log = Logger.new("mechanize/mechanize.log") if $VERBOSE
98
+ FileUtils.makedirs(LogDir) if $VERBOSE or defined?(RSpec)
99
+ agent.log = Logger.new("#{LogDir}/mechanize.log") if $VERBOSE
94
100
  }
95
101
  @results = []
96
102
  @errors = Hash.new
@@ -109,7 +115,6 @@ module Brand2csv
109
115
  # @marke = "*WEIH*"
110
116
  @timespan = nil
111
117
  end
112
- @marke = 'asp*'
113
118
  end
114
119
 
115
120
  def writeResponse(filename)
@@ -141,6 +146,8 @@ module Brand2csv
141
146
  }
142
147
  end
143
148
 
149
+ UseClick = false
150
+
144
151
  def parse_swissreg(timespan = @timespan, # sollte 377 Treffer ergeben, für 01.06.2007-10.06.2007, 559271 wurde in diesem Zeitraum registriert
145
152
  marke = @marke,
146
153
  nummer =@number) # nummer = "559271" ergibt genau einen treffer
@@ -150,7 +157,7 @@ module Brand2csv
150
157
  # HTTP status code is also strange at redirection.
151
158
  @agent.get Start_uri # get a cookie for the session
152
159
  content = @agent.get_file Start_uri
153
- writeResponse('mechanize/start.jsp')
160
+ writeResponse("#{LogDir}/start.jsp")
154
161
  # get only view state
155
162
  @state = view_state(content)
156
163
  data = [
@@ -160,9 +167,13 @@ module Brand2csv
160
167
  ["id_swissreg:_idcl", "id_swissreg_sub_nav_ipiNavigation_item0"],
161
168
  ["javax.faces.ViewState", @state],
162
169
  ]
163
-
164
- content = @agent.post(Start_uri, data)
165
- writeResponse('mechanize/start2.jsp')
170
+ if UseClick
171
+ Swissreg::setAllInputValue(@agent.page.forms.first, data)
172
+ @agent.page.forms.first.submit
173
+ else
174
+ @agent.post(Start_uri, data)
175
+ end
176
+ writeResponse("#{LogDir}/start2.jsp")
166
177
  # Navigation with mechanize like this fails and returns to the home page
167
178
  # @agent.page.link_with(:id => "id_swissreg_sub_nav_ipiNavigation_item0").click
168
179
 
@@ -174,20 +185,34 @@ module Brand2csv
174
185
  ["javax.faces.ViewState", @state],
175
186
  ]
176
187
  # sr1 ist die einfache suche, sr3 die erweiterte Suche
177
- @path = "/srclient/faces/jsp/trademark/sr3.jsp"
178
- response = @agent.post(Base_uri + @path, data)
179
- writeResponse('mechanize/sr3.jsp')
188
+ if UseClick
189
+ Swissreg::setAllInputValue(@agent.page.forms.first, data)
190
+ @agent.page.forms.first.submit
191
+ else
192
+ @agent.post(Sr3, data)
193
+ end
194
+ writeResponse("#{LogDir}/sr3.jsp")
180
195
 
181
196
  # Fill out form values
182
- @agent.page.form('id_swissreg').checkboxes.each{ |box|
183
- TMChoiceFields.index(box.value) ? box.check : box.uncheck
184
- box.check if $VERBOSE
185
- # select all publication reasons
186
- box.check if /id_ckbTMPubReason/.match(box.name)
187
- # select all publication states
188
- box.check if /id_ckbTMState/.match(box.name)
189
- }
190
- if $VERBOSE # and false # fill all details for marke 567120
197
+ selectedPublicationStates = ['1', '3']
198
+ @agent.page.form('id_swissreg').checkboxes.each{
199
+ |box|
200
+ TMChoiceFields.index(box.value) ? box.check : box.uncheck
201
+ # box.check if $VERBOSE
202
+ # select all publication reasons
203
+ box.check if /id_ckbTMPubReason/.match(box.name)
204
+ # select all publication states or accept default states
205
+ # box.check if /id_ckbTMState/.match(box.name)
206
+ if /id_ckbTMState/.match(box.name)
207
+ if selectedPublicationStates.index(box.value)
208
+ puts "Select id_ckbTMState #{box.value}" if $VERBOSE
209
+ box.check
210
+ else
211
+ box.uncheck
212
+ end
213
+ end
214
+ }
215
+ if $VERBOSE and false # fill all details for marke 567120
191
216
  # Felder, welche nie bei der Antwort auftauchen
192
217
  @agent.page.form('id_swissreg').field(:name => 'id_swissreg:mainContent:id_txf_licensee') { |x| x.value = 'BBB Inc*' }
193
218
  @agent.page.form('id_swissreg').field(:name => 'id_swissreg:mainContent:id_txf_expiryDate') { |x| x.value = timespan }
@@ -203,13 +228,13 @@ module Brand2csv
203
228
  end
204
229
 
205
230
  # Feld, welches im Resultat angezeigt wird
206
- @agent.page.form('id_swissreg').field(:name => 'id_swissreg:mainContent:id_txf_tm_text') { |x| x.value = "asp*" }
231
+ @agent.page.form('id_swissreg').field(:name => 'id_swissreg:mainContent:id_txf_tm_text') { |x| x.value = @marke}
207
232
 
208
233
  # Felder, welches nie bei der Antwort auftaucht. Ein Versuch .gsub('.', '%2E') schlug ebenfalls fehl!
209
234
  @agent.page.form('id_swissreg').field(:name => 'id_swissreg:mainContent:id_txf_appDate') { |x| x.value = timespan}
210
235
 
211
236
  # Feld, welches ebenfalls berücksichtigt wird
212
- @agent.page.form('id_swissreg').field(:name => 'id_swissreg:mainContent:id_cbxHitsPerPage') { |x| x.value = @hitsPerPage }
237
+ @agent.page.form('id_swissreg').field(:name => 'id_swissreg:mainContent:id_cbxHitsPerPage') { |x| x.value = HitsPerPage }
213
238
  @agent.page.form('id_swissreg').field(:name => 'autoScroll') { |x| x.value = '0,0' }
214
239
 
215
240
  if $VERBOSE
@@ -218,165 +243,190 @@ module Brand2csv
218
243
  @agent.page.form('id_swissreg').checkboxes.each{ |box| puts "#{box.name} checked? #{box.checked}"}
219
244
  end
220
245
 
221
- @agent.page.form('id_swissreg').click_button(@agent.page.form('id_swissreg').button_with(:value => "suchen"))
246
+ @criteria = [
247
+ ["autoScroll", "0,829"],
248
+ ["id_swissreg:_link_hidden_", ""],
249
+ ["id_swissreg:mainContent:id_ckbTMState", "1"], # "Hängige Gesuche 1
250
+ # ["id_swissreg:mainContent:id_ckbTMState", "2"], # "Gelöschte Gesuche 2
251
+ ["id_swissreg:mainContent:id_ckbTMState", "3"], # aktive Marken 3
252
+ # ["id_swissreg:mainContent:id_ckbTMState", "4"], # gelöschte Marken 4
253
+ ["id_swissreg:mainContent:id_cbxCountry", "_ALL"], # Auswahl Länder _ALL
254
+ # ["id_swissreg:mainContent:id_txf_tm_no", ""], # Marken Nr
255
+ ["id_swissreg:mainContent:id_txf_tm_no", nummer],# Marken Nr
256
+ ["id_swissreg:mainContent:id_txf_app_no", ""], # Gesuch Nr.
257
+ ["id_swissreg:mainContent:id_txf_tm_text", marke],
258
+ ["id_swissreg:mainContent:id_txf_applicant", ""], # Inhaber/in
259
+ ["id_swissreg:mainContent:id_txf_agent", ""], # Vertreter/in
260
+ ["id_swissreg:mainContent:id_txf_licensee", ""], # Lizenznehmer
261
+ ["id_swissreg:mainContent:id_txf_nizza_class", ""], # Nizza Klassifikation Nr.
262
+ # ["id_swissreg:mainContent:id_txf_appDate", timespan], # Hinterlegungsdatum
263
+ ["id_swissreg:mainContent:id_txf_appDate", timespan] ,
264
+ ["id_swissreg:mainContent:id_txf_expiryDate", ""], # Ablauf Schutzfrist
265
+ # Markenart: Individualmarke 1 Kollektivmarke 2 Garantiemarke 3
266
+ ["id_swissreg:mainContent:id_cbxTMTypeGrp", "_ALL"], # Markenart
267
+ ["id_swissreg:mainContent:id_cbxTMForm", "_ALL"], # Markentyp
268
+ ["id_swissreg:mainContent:id_cbxTMColorClaim", "_ALL"], # Farbanspruch
269
+ ["id_swissreg:mainContent:id_txf_pub_date", ""], # Publikationsdatum
270
+
271
+ # info zu Publikationsgrund id_swissreg:mainContent:id_ckbTMPubReason
272
+ ["id_swissreg:mainContent:id_ckbTMPubReason", "1"], #Neueintragungen
273
+ ["id_swissreg:mainContent:id_ckbTMPubReason", "2"], #Berichtigungen
274
+ ["id_swissreg:mainContent:id_ckbTMPubReason", "3"], #Verlängerungen
275
+ ["id_swissreg:mainContent:id_ckbTMPubReason", "4"], #Löschungen
276
+ ["id_swissreg:mainContent:id_ckbTMPubReason", "5"], #Inhaberänderungen
277
+ ["id_swissreg:mainContent:id_ckbTMPubReason", "6"], #Vertreteränderungen
278
+ ["id_swissreg:mainContent:id_ckbTMPubReason", "7"], #Lizenzänderungen
279
+ ["id_swissreg:mainContent:id_ckbTMPubReason", "8"], #Weitere Registeränderungen
280
+ # ["id_swissreg:mainContent:id_ckbTMEmptyHits", "0"], # Leere Trefferliste anzeigen
281
+
282
+ # "id_swissreg:mainContent:id_cbxFormatChoice" 2 = Publikationsansicht 1 = Registeransicht
283
+ ["id_swissreg:mainContent:id_cbxFormatChoice", "1"],
284
+ ["id_swissreg:mainContent:id_cbxHitsPerPage", HitsPerPage], # Treffer pro Seite
285
+ ]
286
+ TMChoiceFields.each{ | field2display| @criteria << ["id_swissreg:mainContent:id_ckbTMChoice", field2display] }
287
+ # id_swissreg:mainContent:id_ckbTMChoice tm_lbl_tm_text
288
+ puts "Marke ist #{marke}" if marke # Wortlaut der Marke
289
+ puts "Hinterlegungsdatum ist #{timespan}" if $VERBOSE and timespan
290
+ puts "nummer ist #{timespan}" if nummer
291
+ @criteria << ["id_swissreg:mainContent:sub_fieldset:id_submit", "suchen"]
292
+ @criteria << ["id_swissreg_SUBMIT", "1"]
293
+ @criteria << ["id_swissreg:_idcl", ""]
294
+ @criteria << ["id_swissreg:_link_hidden_", ""]
295
+ @criteria << ["javax.faces.ViewState", @state]
296
+
297
+ if true # UseClick
298
+ # Swissreg::setAllInputValue(@agent.page.forms.first, @criteria)
299
+ # setPublicationStates(@agent.page.form('id_swissreg'))
300
+ @agent.page.form('id_swissreg').click_button(@agent.page.form('id_swissreg').button_with(:value => "suchen"))
301
+ else # use post
302
+ writeResponse("#{LogDir}/vor_post_sr3.jsp")
303
+ @agent.post(Sr3, @criteria)
304
+ writeResponse("#{LogDir}/erweiterte_suche.html")
305
+ @agent.page.form('id_swissreg').click_button(@agent.page.form('id_swissreg').button_with(:value => "suchen"))
306
+ end
222
307
  # Hier sollten eigentlich alle Felder auftauchen, wie
223
- # Marke=asp*; Land (Inhaber/in)=Schweiz; Markenart=Alle; Markentyp=Alle; Farbanspruch=Alle; Publikationsgrund= Neueintragungen, Berichtigungen, Verlängerungen, Löschungen, Inhaberänderungen, Vertreteränderungen, Lizenzänderungen, Weitere Registeränderungen; Status= hängige Gesuche, aktive Marken
224
- writeResponse('mechanize/result.jsp')
308
+ # Marke=asp*; Land (Inhaber/in)=Schweiz; Markenart=Alle; Markentyp=Alle; Farbanspruch=Alle; Publikationsgrund= Neueintragungen, Berichtigungen, Verlängerungen, Löschungen, Inhaberänderungen, Vertreteränderungen, Lizenzänderungen, Weitere Registeränderungen; Status= hängige Gesuche, aktive Marken
309
+ writeResponse("#{LogDir}/resultate.jsp")
225
310
  end
226
311
 
227
- def parseAddress(nummer, zeilen)
312
+ # the number is only passed to facilitate debugging
313
+ # lines are the address lines
314
+ def Swissreg::parseAddress(number, lines)
228
315
  ort = nil
229
316
  plz = nil
230
317
 
231
318
  # Search for plz/address
232
- 1.upto(zeilen.length-1).each {
319
+ 1.upto(lines.length-1).each {
233
320
  |cnt|
234
- if m = AddressRegexp.match(zeilen[cnt])
235
- zeilen[cnt+1] = nil
321
+ if m = AddressRegexp.match(lines[cnt])
322
+ lines[cnt+1] = nil
236
323
  plz = m[1]; ort = m[2]
237
- cnt.upto(MaxZeilen-1).each{ |cnt2| zeilen[cnt2] = nil }
324
+ cnt.upto(MaxZeilen-1).each{ |cnt2| lines[cnt2] = nil }
238
325
  break
239
326
  end
240
327
  }
241
328
  unless plz
242
- puts "Achtung! Konnte Marke #{nummer} mit Inhaber #{zeilen.inspect} nicht parsen" if $VERBOSE
329
+ puts "Achtung! Konnte Marke #{number} mit Inhaber #{lines.inspect} nicht parsen" if $VERBOSE
243
330
  return nil, nil, nil, nil, nil, nil, nil, nil
244
331
  end
245
332
  # search for lines with only digits
246
333
  found = false
247
- 1.upto(zeilen.length-1).each {
334
+ 1.upto(lines.length-1).each {
248
335
  |cnt|
249
- break if zeilen[cnt] == nil
250
- if /^\d*$/.match(zeilen[cnt])
336
+ break if lines[cnt] == nil
337
+ if /^\d*$/.match(lines[cnt])
251
338
  found = true
252
- if zeilen[cnt+1] == nil
339
+ if lines[cnt+1] == nil
253
340
  found = 'before'
254
- zeilen[cnt-1] += LineSplit + zeilen[cnt]
255
- zeilen.delete_at(cnt)
341
+ lines[cnt-1] += LineSplit + lines[cnt]
342
+ lines.delete_at(cnt)
256
343
  else
257
344
  found = 'after'
258
- zeilen[cnt] += LineSplit + zeilen[cnt+1]
259
- zeilen.delete_at(cnt+1)
345
+ lines[cnt] += LineSplit + lines[cnt+1]
346
+ lines.delete_at(cnt+1)
260
347
  end
261
348
  end
262
349
  }
263
- puts "found #{found}: #{zeilen.inspect}" if found and $VERBOSE
264
- return zeilen[0], zeilen[1], zeilen[2], zeilen[3], zeilen[4], plz, ort
350
+ puts "found #{found}: #{lines.inspect}" if found and $VERBOSE
351
+ return lines[0], lines[1], lines[2], lines[3], lines[4], plz, ort
265
352
  end
266
353
 
267
- def fetchDetails(nummer) # takes a long time!
268
- @counterDetails += 1
269
- filename = "mechanize/detail_#{nummer}.html"
270
- if File.exists?(filename)
271
- doc = Nokogiri::Slop(File.open(filename))
272
- else
273
- url = "https://www.swissreg.ch/srclient/faces/jsp/trademark/sr300.jsp?language=de&section=tm&id=#{nummer}"
274
- pp "Opening #{url}" if $VERBOSE
275
- content = @agent.get_file url
276
- writeResponse("mechanize/detail_#{nummer}.html")
277
- doc = Nokogiri::Slop(content)
278
- end
279
- puts "Bitte um Geduld. Holte Adressdetails für Marke #{nummer}. (#{@counterDetails} von #{@errors.size})"
280
- path_name = "//html/body/form/div/div/fieldset/div/table/tbody/tr/td"
281
- counter = 0
282
- doc.xpath(path_name).each{
283
- |td|
284
- pp "#{counter}: #{td.text}" if $VERBOSE
285
- counter += 1
286
- next unless /^inhaber/i.match(td.text)
287
- zeilen = []
288
- doc.xpath(path_name)[counter].children.each{ |child| zeilen << child.text unless child.text.length == 0 } # avoid adding <br>
289
- if info = @errors[nummer]
290
- info.inhaber = zeilen.join(" ")
291
- info.zeile_1, info.zeile_2, info.zeile_3, info.zeile_4, zeile_5, info.plz, info.ort = parseAddress(nummer, zeilen)
292
- @results << info
293
- else
294
- bezeichnung = doc.xpath(path_name)[15]
295
- inhaber = zeilen.join(" ")
296
- zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort = parseAddress(nummer, zeilen)
297
- hinterlegungsdatum = doc.xpath(path_name)[7]
298
- marke = Marke.new(bezeichnung, nummer, inhaber, DefaultCountry, hinterlegungsdatum, zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort )
299
- @results << marke
300
- end
301
- }
354
+ def Swissreg::getInputValuesFromPage(body) # body of HTML page
355
+ contentData = []
356
+ body.search('input').each{ |input|
357
+ # puts "name: #{input.attribute('name')} value #{input.attribute('value')}"
358
+ contentData << [ input.attribute('name').to_s, input.attribute('value').to_s ]
359
+ }
360
+ contentData
361
+ end
362
+
363
+ # return value of an array of POST values
364
+ def Swissreg::inputValue(values, key)
365
+ values.each{ |val|
366
+ return val[1] if key.eql?(val[0])
367
+ }
368
+ return nil
369
+ end
370
+
371
+ # set value for a key of an array of POST values
372
+ def Swissreg::setInputValue(values, key, newValue)
373
+ values.each{ |val|
374
+ if key.eql?(val[0])
375
+ val[1] = newValue
376
+ return
377
+ end
378
+ }
379
+ return
380
+ end
381
+
382
+ def Swissreg::setAllInputValue(form, values)
383
+ values.each{ |newValue|
384
+ # puts "x: 0 #{ newValue[0].to_s} 1 #{newValue[1].to_s}"
385
+ form.field(:name => newValue[0].to_s) { |elem|
386
+ next if elem == nil # puts "Cannot set #{newValue[0].to_s}"
387
+ elem.value = newValue[1].to_s
388
+ }
389
+ }
302
390
  end
303
391
 
304
- def fetchresult(filename = nil, counter = 1)
305
- if filename
306
- doc = Nokogiri::Slop(File.open(filename))
307
- else
308
- body = @agent.page.body
309
- body.force_encoding('utf-8')
310
- doc = Nokogiri::Slop(body)
311
- end
312
- nrFailures = 0
313
- counter += 1
314
- puts "fetchresult. Counter #{counter} already #{@results.size} Datensätze für die Zeitspanne '#{@timespan}'" if $VERBOSE
315
- path_name = "//html/body/form/div/div/fieldset/table/tbody/tr/td/table/tr/td"
316
- hasNext = false
317
- doc.xpath(path_name).each{
318
- |elem|
319
- if /scroll_1idx#{counter}/.match(elem.to_s)
320
- hasNext = true
321
- break
322
- end
392
+ def Swissreg::getMarkenInfoFromDetail(doc)
393
+ marke = nil
394
+ number = 'invalid'
395
+ bezeichnung = nil
396
+ inhaber = nil
397
+ hinterlegungsdatum = nil
398
+ zeilen = []
399
+ doc.xpath("//html/body/form/div/div/fieldset/div/table/tbody/tr").each{
400
+ |x|
401
+ if x.children.first.text.eql?('Marke')
402
+ if x.children[1].text.index('Markenabbildung')
403
+ # we must fetch the link to the image
404
+ bezeichnung = x.children[1].elements.first.attribute('href').text
405
+ else # we got a trademark
406
+ bezeichnung = x.children[1].text
407
+ end
408
+ end
409
+ if x.children.first.text.eql?('Inhaber/in')
410
+ inhaber = />(.*)<\/td/.match(x.children[1].to_s)[1].gsub('<br>',LineSplit)
411
+ x.children[1].children.each{ |child| zeilen << child.text unless child.text.length == 0 } # avoid adding <br>
412
+ end
413
+ hinterlegungsdatum = x.children[1].text if x.children.first.text.eql?('Hinterlegungsdatum')
414
+ number = x.children[1].text if x.children.first.text.eql?('Gesuch Nr.')
323
415
  }
324
- path_name = "//html/body/form/div/div/fieldset/table/tbody/tr/td/table/tbody/tr"
325
- doc.xpath(path_name).each{
326
- |elem|
327
- bezeichnung = elem.elements[1].text
328
- land = elem.elements[4].text
329
- next unless /#{DefaultCountry}/i.match(land)
330
- inhaber = elem.elements[3].text
331
- nummer = elem.elements[2].text
332
- if bezeichnung.length == 0
333
- bezeichnung = elem.children[1].children[0].children[0].children[0].attribute('src').to_s
334
- end
335
- zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort = parseAddress(nummer, inhaber.split(LineSplit))
336
- if zeile_1
337
- @results << Marke.new(bezeichnung, elem.elements[2].text, elem.elements[3].text, land, elem.elements[5].text,
338
- zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort )
339
- else
340
- nrFailures += 1
341
- @errors[nummer] = Marke.new(bezeichnung, elem.elements[2].text, elem.elements[3].text, land, elem.elements[5].text,
342
- zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort )
343
- end
344
- } if doc.xpath(path_name)
345
- if hasNext
346
- @path = "/srclient/faces/jsp/trademark/sr30.jsp"
347
- puts "Calling sub #{counter} with #{@path}" if $VERBOSE
348
- data = [
349
- ["autoScroll", "0,0"],
350
- ["id_swissreg:mainContent:id_sub_options_result:sub_fieldset:id_cbxHitsPerPage", @hitsPerPage],
351
- # ["id_swissreg:mainContent:vivian", "TRADEMARK REGISTER SEARCH TIMES: QUERY=[20] SELECT=[823] SERVER=[846] DELEGATE=[861] (HITS=[96])"],
352
- ["id_swissreg_SUBMIT", "1"],
353
- ["id_swissreg:_idcl", "id_swissreg:mainContent:scroll_1idx#{counter}"],
354
- ["id_swissreg:mainContent:scroll_1", "idx#{counter}"],
355
- ["tmMainId", ""],
356
- ["id_swissreg:_link_hidden_ "],
357
- ["javax.faces.ViewState", @state],
358
- ]
359
- TMChoiceFields.each{ | field2display| data << ["id_swissreg:mainContent:id_sub_options_result:id_ckbTMChoice", field2display] }
360
- response = @agent.post(Base_uri + @path, data)
361
- writeResponse("mechanize/resultate_#{counter}.html")
362
- checkErrors(response.body)
363
- fetchresult(nil, counter)
364
- else
365
- puts "Es gab #{nrFailures} Fehler beim Lesen von #{filename}" if $VERBOSE
366
- puts "Fand #{@results.size} Datensätze für die Zeitspanne '#{@timespan}'. Von #{@errors.size} muss die Adresse noch geholt werden."
367
- end
416
+ zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort = Swissreg::parseAddress(number, zeilen)
417
+ marke = Marke.new(bezeichnung, number, inhaber, DefaultCountry, hinterlegungsdatum, zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort )
368
418
  end
369
-
370
- def emitCsv(filename='ausgabe.csv')
371
- return if @results.size == 0
419
+
420
+ def Swissreg::emitCsv(results, filename='ausgabe.csv')
421
+ return if results == nil or results.size == 0
372
422
  if /^1\.8/.match(RUBY_VERSION)
373
423
  ausgabe = File.open(filename, 'w+')
374
424
  # Write header
375
425
  s=''
376
- @results[0].members.each { |member| s += member + ';' }
426
+ results[0].members.each { |member| s += member + ';' }
377
427
  ausgabe.puts s.chop
378
428
  # write all line
379
- @results.each{
429
+ results.each{
380
430
  |result|
381
431
  s = ''
382
432
  result.members.each{ |member|
@@ -391,28 +441,155 @@ module Brand2csv
391
441
  ausgabe.puts s.chop
392
442
  }
393
443
  else
394
- CSV.open(filename, 'w', :headers=>@results[0].members,
444
+ CSV.open(filename, 'w', :headers=>results[0].members,
395
445
  :write_headers => true,
396
446
  :col_sep => ';'
397
- ) do |csv| @results.each{ |x| csv << x }
447
+ ) do |csv| results.each{ |x| csv << x }
398
448
  end
399
449
  end
400
450
  end
451
+
452
+ class Swissreg::Vereinfachte
453
+ attr_reader :links2details, :trademark_search_id, :inputData, :firstHit, :nrHits, :nrSubPages, :pageNr
454
+ HitRegexpDE = /Seite (\d*) von ([\d']*) - Treffer ([\d']*)-([\d']*) von ([\d']*)/
455
+ Vivian = 'id_swissreg:mainContent:vivian'
456
+
457
+ # Parse a HTML page from swissreg sr3.jsp
458
+ # There we find info like "Seite 1 von 26 - Treffer 1-250 von 6'349" and up to 250 links to details
459
+ def initialize(doc)
460
+ @inputData = []
461
+ m = HitRegexpDE.match(doc.text)
462
+ @pageNr = m[1].sub("'", '').to_i
463
+ @nrSubPages = m[2].sub("'", '').to_i
464
+ @firstHit = m[3].sub("'", '').to_i
465
+ @nrHits = m[5].sub("'", '').to_i
466
+ @trademark_search_id = Swissreg::inputValue(Swissreg::getInputValuesFromPage(doc), Vivian)
467
+ @links2details = []
468
+ doc.search('input').each{ |input|
469
+ # puts "name: #{input.attribute('name')} value #{input.attribute('value')}" if $VERBOSE
470
+ @inputData << [ input.attribute('name').to_s, input.attribute('value').to_s ]
471
+ }
472
+
473
+ @state = Swissreg::inputValue(Swissreg::getInputValuesFromPage(doc), 'javax.faces.ViewState')
474
+ doc.search('a').each{
475
+ |link|
476
+ if m = /d_swissreg:mainContent:data:(\d*):tm_no_detail:id_detail/i.match(link.attribute('id'))
477
+ # puts "XXX #{link.attribute('onclick').to_s} href: #{link.attribute('href').to_s} value #{link.attribute('value').to_s}" if $VERBOSE
478
+ m = /'tmMainId','(\d*)'/.match(link.attribute('onclick').to_s)
479
+ tmMainId = m[1].to_i
480
+ @links2details << tmMainId
481
+ end
482
+ }
483
+ end
484
+
485
+ def getPostDataForDetail(position, id)
486
+ [
487
+ [ "autoScroll", "0,0"],
488
+ [ "id_swissreg:mainContent:sub_options_result:sub_fieldset:cbxHitsPerPage", "#{HitsPerPage}"],
489
+ [ "id_swissreg:mainContent:vivian", @trademark_search_id],
490
+ [ "id_swissreg_SUBMIT", "1"],
491
+ [ "id_swissreg:_idcl", "id_swissreg:mainContent:data:#{position}:tm_no_detail:id_detail", ""],
492
+ [ "id_swissreg:mainContent:scroll_1", ""],
493
+ [ "tmMainId", "#{id}"],
494
+ [ "id_swissreg:_link_hidden_ "],
495
+ [ "javax.faces.ViewState", @state]
496
+ ]
497
+ end
498
+
499
+ def getPostDataForSubpage(pageNr)
500
+ [
501
+ [ "autoScroll", "0,0"],
502
+ [ "id_swissreg:mainContent:sub_options_result:sub_fieldset:cbxHitsPerPage", "#{HitsPerPage}"],
503
+ [ "id_swissreg:mainContent:vivian", @trademark_search_id],
504
+ [ "id_swissreg_SUBMIT", "1"],
505
+ [ "id_swissreg:_idcl", "id_swissreg:mainContent:scroll_1idx#{pageNr}"],
506
+ [ "id_swissreg:mainContent:scroll_1", "idx#{pageNr}"],
507
+ [ "tmMainId", ""],
508
+ [ "id_swissreg:_link_hidden_ "],
509
+ [ "javax.faces.ViewState", @state]
510
+ ]
511
+ end
401
512
 
402
- def fetchMissingDetails
403
- @errors.each{
404
- |markennummer, info|
405
- fetchDetails(markennummer)
513
+ end
514
+
515
+ def getAllHits(filename = nil, pageNr = 1)
516
+ if filename && File.exists?(filename)
517
+ doc = Nokogiri::Slop(File.open(filename))
518
+ else
519
+ body = @agent.page.body
520
+ body.force_encoding('utf-8')
521
+ doc = Nokogiri::Slop(body)
522
+ filename = "#{LogDir}/vereinfachte_#{pageNr}.html"
523
+ writeResponse(filename)
524
+ end
525
+
526
+ einfach = Swissreg::Vereinfachte.new(doc)
527
+ puts "#{Time.now.strftime("%H:%M:%S")} status: fetch #{pageNr} of #{einfach.nrSubPages}"
528
+ subPage2Fetch = pageNr + 1
529
+ data2 = einfach.getPostDataForSubpage(subPage2Fetch).clone
530
+ if (HitsPerPage < einfach.nrHits - einfach.firstHit)
531
+ itemsToFetch = HitsPerPage
532
+ else
533
+ itemsToFetch = einfach.nrHits - einfach.firstHit
534
+ end
535
+ 0.upto(itemsToFetch-1) {
536
+ |position|
537
+ id = einfach.links2details[position]
538
+ nextId = einfach.firstHit.to_i - 1 + position.to_i
539
+ data3 = einfach.getPostDataForDetail(nextId, id)
540
+ Swissreg::setAllInputValue(@agent.page.forms.first, data3)
541
+ @agent.page.forms.first.submit
542
+ filename = "#{LogDir}/vereinfachte_detail_#{einfach.firstHit + position}.html"
543
+ writeResponse(filename)
544
+ matchResult = @agent.page.search('h1').text
545
+ unless /Detailansicht zu (Gesuch|Marke)/.match(matchResult)
546
+ puts matchResult
547
+ puts "Attention did not find 'Detailansicht' in #{filename}. Someting went wrong!"
548
+ break
549
+ end
550
+ @results << Swissreg::getMarkenInfoFromDetail(Nokogiri::Slop(@agent.page.body))
551
+ @agent.back
406
552
  }
553
+ filename = "#{LogDir}/vereinfachte_#{pageNr}_back.html"
554
+ writeResponse(filename)
555
+ if pageNr < (einfach.nrSubPages-1)
556
+ puts "Fetching page #{subPage2Fetch} of #{einfach.nrSubPages}" if $VERBOSE
557
+ Swissreg::setAllInputValue(@agent.page.forms.first, data2)
558
+ @agent.page.forms.first.submit
559
+ getAllHits(nil, subPage2Fetch)
560
+ @agent.back
561
+ end
562
+
407
563
  end
564
+
565
+ def fetchresult(filename = "#{LogDir}/fetch_1.html", counter = 1)
566
+ if filename && File.exists?(filename)
567
+ doc = Nokogiri::Slop(File.open(filename))
568
+ else
569
+ body = @agent.page.body
570
+ body.force_encoding('utf-8')
571
+ doc = Nokogiri::Slop(body)
572
+ writeResponse(filename)
573
+ end
574
+
575
+ if /Vereinfachte Trefferliste anzeigen/i.match(doc.text)
576
+ form = @agent.page.forms.first
577
+ button = form.button_with(:value => /Vereinfachte/i)
578
+ # submit the form using that button
579
+ @agent.submit(form, button)
580
+ filename = "#{LogDir}/vereinfacht.html"
581
+ writeResponse(filename)
582
+ end
583
+ getAllHits(filename, counter)
584
+ end
585
+
408
586
  end # class Swissreg
409
587
 
410
- def Brand2csv::run(timespan)
411
- session = Swissreg.new(timespan)
588
+ def Brand2csv::run(timespan, marke = 'a*')
589
+ session = Swissreg.new(timespan, marke)
412
590
  session.parse_swissreg
413
591
  session.fetchresult
414
- session.fetchMissingDetails
415
- session.emitCsv("#{timespan}.csv")
592
+ Swissreg::emitCsv(session.results, "#{timespan}.csv")
416
593
  end
417
594
 
418
595
  end # module Brand2csv
@@ -1,3 +1,3 @@
1
1
  module Brand2csv
2
- VERSION = "0.1.5"
2
+ VERSION = "0.1.6"
3
3
  end
@@ -1,3 +1,14 @@
1
+ * Work on 2013.05.27
2
+
3
+ * Should be able to fetch up to 10'000 hits.
4
+ ** Problems: Seems to hang silently after a few thousand hits
5
+ ** Cannot limit search to only "Hängige Gesuche" and "Aktive Marken"
6
+
7
+ * Work on 2013.05.26
8
+
9
+ * Added first rake tests to speed up work for 10'000 hits
10
+ * Added second parameter to limit according to trademark name as passing a timespan is not honoured by swissreg
11
+
1
12
  * Work on 2013.05.22
2
13
 
3
14
  ** Use timespan als filename
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: brand2csv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.6
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-05-23 00:00:00.000000000 Z
12
+ date: 2013-05-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
@@ -124,6 +124,7 @@ files:
124
124
  - spike.rb
125
125
  - spike_mechanize_swissreg.rb
126
126
  - spike_watir.rb
127
+ - .gemtest
127
128
  homepage: https://github.com/zdavatz/brand2csv
128
129
  licenses: []
129
130
  post_install_message: