brand2csv 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile CHANGED
@@ -1,9 +1,5 @@
1
1
  source 'https://rubygems.org'
2
- gem 'mechanize' # , '~> 2.5.1'
3
- gem 'mechanize', :path => '/opt/src/mechanize/pkg' if false
4
- git 'git://github.com/sparklemotion/mechanize.git' do #, :tag => 'v0.13.4'
5
- gem 'mechanize'
6
- end if false
2
+ gem 'mechanize'
7
3
  gem 'json', '~> 1.7.7'
8
4
  gem 'nokogiri'
9
5
 
data/History.txt CHANGED
@@ -1,3 +1,9 @@
1
+ === 0.2.2 05.06.2013
2
+
3
+ * Fixed (re-)opening of session
4
+ * Reorganized for long runs. Errors still occur on reconnect
5
+ * Added real fetch from swissreg to specs
6
+
1
7
  === 0.2.1 02.06.2013
2
8
 
3
9
  * Fixed problems with ampersands. Emit a clear error when swissreg.ch is not responding correctly
data/lib/brand2csv.rb CHANGED
@@ -33,6 +33,7 @@ module Brand2csv
33
33
  Sr3 = "#{Base_uri}/srclient/faces/jsp/trademark/sr3.jsp"
34
34
  Sr30 = "#{Base_uri}/srclient/faces/jsp/trademark/sr30.jsp"
35
35
  Sr300 = "#{Base_uri}/srclient/faces/jsp/trademark/sr300.jsp"
36
+ DetailRegexp = /d_swissreg:mainContent:data:(\d*):tm_no_detail:id_detail/i
36
37
  AddressRegexp = /^(\d\d\d\d)\W*(.*)/
37
38
  LineSplit = ', '
38
39
  DefaultCountry = 'Schweiz'
@@ -84,7 +85,7 @@ module Brand2csv
84
85
 
85
86
  MaxZeilen = 5
86
87
  HitsPerPage = 250
87
- LogDir = 'mechanize'
88
+ LogDir = 'log'
88
89
 
89
90
  attr_accessor :marke, :results, :timespan
90
91
 
@@ -92,14 +93,8 @@ module Brand2csv
92
93
  @timespan = timespan
93
94
  @marke = marke
94
95
  @number = nil
95
-
96
- @agent = Mechanize.new { |agent|
97
- agent.user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0'
98
- agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
99
- FileUtils.makedirs(LogDir) if $VERBOSE or defined?(RSpec)
100
- agent.log = Logger.new("#{LogDir}/mechanize.log") if $VERBOSE
101
- }
102
96
  @results = []
97
+ @all_trademark_numbers = []
103
98
  @errors = Hash.new
104
99
  @lastDetail =nil
105
100
  @counterDetails = 0
@@ -131,11 +126,15 @@ module Brand2csv
131
126
 
132
127
  UseClick = false
133
128
 
134
- def parse_swissreg(timespan = @timespan, # sollte 377 Treffer ergeben, für 01.06.2007-10.06.2007, 559271 wurde in diesem Zeitraum registriert
135
- marke = @marke,
136
- nummer =@number) # nummer = "559271" ergibt genau einen treffer
137
-
129
+ # Initialize a session with swissreg and save the cookie as @state
130
+ def init_swissreg
138
131
  begin
132
+ @agent = Mechanize.new { |agent|
133
+ agent.user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0'
134
+ agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
135
+ FileUtils.makedirs(LogDir) if $VERBOSE or defined?(RSpec)
136
+ agent.log = Logger.new("#{LogDir}/mechanize.log") if $VERBOSE
137
+ }
139
138
  @agent.get_file Start_uri # 'https://www.swissreg.ch/srclient/faces/jsp/start.jsp'
140
139
  writeResponse("#{LogDir}/session_expired.html")
141
140
  checkErrors(@agent.page.body, false)
@@ -146,6 +145,13 @@ module Brand2csv
146
145
  puts "Net::HTTPInternalServerError oder Mechanize::ResponseCodeError gesehen.\n #{Base_uri} hat wahrscheinlich Probleme"
147
146
  exit 3
148
147
  end
148
+ end
149
+
150
+ def parse_swissreg(timespan = @timespan, # sollte 377 Treffer ergeben, für 01.06.2007-10.06.2007, 559271 wurde in diesem Zeitraum registriert
151
+ marke = @marke,
152
+ nummer =@number) # nummer = "559271" ergibt genau einen treffer
153
+
154
+ init_swissreg
149
155
  data = [
150
156
  ["autoScroll", "0,0"],
151
157
  ["id_swissreg:_link_hidden_", ""],
@@ -303,7 +309,6 @@ module Brand2csv
303
309
  bezeichnung = nil
304
310
  inhaber = nil
305
311
  hinterlegungsdatum = nil
306
- zeilen = []
307
312
  doc.xpath("//html/body/form/div/div/fieldset/div/table/tbody/tr").each{
308
313
  |x|
309
314
  if x.children.first.text.eql?('Marke')
@@ -314,9 +319,8 @@ module Brand2csv
314
319
  bezeichnung = x.children[1].text
315
320
  end
316
321
  end
322
+
317
323
  if x.children.first.text.eql?('Inhaber/in')
318
- # inhaber = />(.*)<\/td/.match(x.children[1].to_s)[1].gsub('<br>',LineSplit).gsub('&amp;', '&')
319
- # x.children[1].children.each{ |child| zeilen << child.text.gsub('&amp;', '&') unless child.text.length == 0 } # avoid adding <br>
320
324
  inhaber = />(.*)<\/td/.match(x.children[1].to_s)[1].gsub('<br>',LineSplit)
321
325
  end
322
326
  hinterlegungsdatum = x.children[1].text if x.children.first.text.eql?('Hinterlegungsdatum')
@@ -326,6 +330,26 @@ module Brand2csv
326
330
  marke = Marke.new(bezeichnung, number, inhaber, DefaultCountry, hinterlegungsdatum, zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort )
327
331
  end
328
332
 
333
+ def fetchDetails(nummer) # takes a long time!
334
+ @counterDetails += 1
335
+ init_swissreg if @counterDetails % 90 == 0 # it seems that swissreg is artificially slowing down serving request after 100 hits
336
+ filename = "#{LogDir}/detail_#{sprintf('%05d', @counterDetails)}_#{nummer.gsub('/','.')}.html"
337
+ if File.exists?(filename)
338
+ doc = Nokogiri::Slop(File.open(filename))
339
+ else
340
+ url = "#{Sr300}?language=de&section=tm&id=#{nummer}"
341
+ pp "#{Time.now.strftime("%H:%M:%S")}: Opening #{filename}" if $VERBOSE
342
+ $stdout.flush
343
+ content = @agent.get_file url
344
+ body = @agent.page.body
345
+ body.force_encoding('utf-8') unless /^1\.8/.match(RUBY_VERSION)
346
+ doc = Nokogiri::Slop(body)
347
+ writeResponse(filename)
348
+ end
349
+ marke = Swissreg::getMarkenInfoFromDetail(doc)
350
+ @results << marke
351
+ end
352
+
329
353
  def Swissreg::emitCsv(results, filename='ausgabe.csv')
330
354
  return if results == nil or results.size == 0
331
355
  if /^1\.8/.match(RUBY_VERSION)
@@ -360,6 +384,17 @@ module Brand2csv
360
384
  end
361
385
  end
362
386
 
387
+ def Swissreg::getTrademarkNumbers(doc)
388
+ trademark_numbers = []
389
+ doc.search('a').each{
390
+ |link|
391
+ if DetailRegexp.match(link.attribute('id'))
392
+ trademark_numbers << link.children.first.children.first.content
393
+ end
394
+ }
395
+ trademark_numbers
396
+ end
397
+
363
398
  class Swissreg::Vereinfachte
364
399
  attr_reader :links2details, :trademark_search_id, :inputData, :firstHit, :nrHits, :nrSubPages, :pageNr
365
400
  HitRegexpDE = /Seite (\d*) von ([\d']*) - Treffer ([\d']*)-([\d']*) von ([\d']*)/
@@ -384,7 +419,7 @@ module Brand2csv
384
419
  @state = Swissreg::inputValue(Swissreg::getInputValuesFromPage(doc), 'javax.faces.ViewState')
385
420
  doc.search('a').each{
386
421
  |link|
387
- if m = /d_swissreg:mainContent:data:(\d*):tm_no_detail:id_detail/i.match(link.attribute('id'))
422
+ if m = DetailRegexp.match(link.attribute('id'))
388
423
  # puts "XXX #{link.attribute('onclick').to_s} href: #{link.attribute('href').to_s} value #{link.attribute('value').to_s}" if $VERBOSE
389
424
  m = /'tmMainId','(\d*)'/.match(link.attribute('onclick').to_s)
390
425
  tmMainId = m[1].to_i
@@ -422,7 +457,7 @@ module Brand2csv
422
457
  end
423
458
 
424
459
  end
425
-
460
+
426
461
  def getAllHits(filename = nil, pageNr = 1)
427
462
  if filename && File.exists?(filename)
428
463
  doc = Nokogiri::Slop(File.open(filename))
@@ -435,7 +470,7 @@ module Brand2csv
435
470
  end
436
471
 
437
472
  einfach = Swissreg::Vereinfachte.new(doc)
438
- puts "#{Time.now.strftime("%H:%M:%S")} status: fetch #{pageNr} of #{einfach.nrSubPages}"
473
+ puts "#{Time.now.strftime("%H:%M:%S")} status: getAllHits for #{pageNr} of #{einfach.nrSubPages} pages" if $VERBOSE
439
474
  subPage2Fetch = pageNr + 1
440
475
  data2 = einfach.getPostDataForSubpage(subPage2Fetch).clone
441
476
  if (HitsPerPage < einfach.nrHits - einfach.firstHit)
@@ -443,46 +478,16 @@ module Brand2csv
443
478
  else
444
479
  itemsToFetch = einfach.nrHits - einfach.firstHit
445
480
  end
446
- 0.upto(itemsToFetch-1) {
447
- |position|
448
- id = einfach.links2details[position]
449
- nextId = einfach.firstHit.to_i - 1 + position.to_i
450
- data3 = einfach.getPostDataForDetail(nextId, id)
451
- Swissreg::setAllInputValue(@agent.page.forms.first, data3)
452
- nrTries = 1
453
- while true
454
- begin
455
- @agent.page.forms.first.submit
456
- break
457
- rescue
458
- puts "Rescue in submit. nrTries is #{nrTries}. Retry after a few seconds"
459
- nrTries += 1
460
- sleep 10
461
- exit 1 if nrTries > 3
462
- end
463
- end
464
- filename = "#{LogDir}/vereinfachte_detail_#{einfach.firstHit + position}.html"
465
- writeResponse(filename)
466
- matchResult = @agent.page.search('h1').text
467
- unless /Detailansicht zu (Gesuch|Marke)/.match(matchResult)
468
- puts matchResult
469
- puts "Attention did not find 'Detailansicht' in #{filename}. Someting went wrong!"
470
- break
471
- end
472
- @results << Swissreg::getMarkenInfoFromDetail(Nokogiri::Slop(@agent.page.body))
473
- @agent.back
474
- sleep 1
475
- }
481
+ @all_trademark_numbers += Swissreg::getTrademarkNumbers(doc)
482
+
476
483
  filename = "#{LogDir}/vereinfachte_#{pageNr}_back.html"
477
484
  writeResponse(filename)
478
- if pageNr < (einfach.nrSubPages-1)
479
- puts "Fetching page #{subPage2Fetch} of #{einfach.nrSubPages}" if $VERBOSE
480
- Swissreg::setAllInputValue(@agent.page.forms.first, data2)
481
- @agent.page.forms.first.submit
482
- getAllHits(nil, subPage2Fetch)
483
- @agent.back
484
- end
485
-
485
+ if pageNr < (einfach.nrSubPages)
486
+ Swissreg::setAllInputValue(@agent.page.forms.first, data2)
487
+ @agent.page.forms.first.submit
488
+ getAllHits(nil, subPage2Fetch)
489
+ end
490
+ @all_trademark_numbers
486
491
  end
487
492
 
488
493
  def fetchresult(filename = "#{LogDir}/fetch_1.html", counter = 1)
@@ -503,11 +508,33 @@ module Brand2csv
503
508
  filename = "#{LogDir}/vereinfacht.html"
504
509
  writeResponse(filename)
505
510
  end
506
- getAllHits(filename, counter)
511
+ getAllHits(doc, counter)
512
+ puts"getAllHits: returned #{@all_trademark_numbers ? @all_trademark_numbers.size : 0} hits "
513
+ if @all_trademark_numbers
514
+ @all_trademark_numbers.each{
515
+ |nr|
516
+ nrRetries = 0
517
+ begin
518
+ fetchDetails(nr)
519
+ rescue Exception => e
520
+ nrRetries += 1
521
+ puts e.backtrace
522
+ puts "fetchDetails did not work reinit session and retry for #{nr}. nrRetries #{nrRetries}/3. e #{e}"
523
+ if nrRetries <= 3
524
+ init_swissreg
525
+ retry
526
+ else
527
+ raise Interrupt
528
+ end
529
+ end
530
+
531
+ }
532
+ else
533
+ puts "Could not find any trademarks in #{filename}"
534
+ end
507
535
  end
508
-
509
536
  end # class Swissreg
510
-
537
+
511
538
  def Brand2csv::run(timespan, marke = 'a*')
512
539
  session = Swissreg.new(timespan, marke)
513
540
  begin
@@ -517,6 +544,7 @@ module Brand2csv
517
544
  puts "Unterbrochen. Vesuche #{session.results.size} Resultate zu speichern"
518
545
  end
519
546
  Swissreg::emitCsv(session.results, "#{timespan}.csv")
547
+ session.results
520
548
  end
521
549
 
522
550
  end # module Brand2csv
@@ -1,3 +1,3 @@
1
1
  module Brand2csv
2
- VERSION = "0.2.1"
2
+ VERSION = "0.2.2"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: brand2csv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-06-02 00:00:00.000000000 Z
12
+ date: 2013-06-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
@@ -148,7 +148,6 @@ files:
148
148
  - spike_mechanize_swissreg.rb
149
149
  - spike_watir.rb
150
150
  - swissreg.rb
151
- - .gemtest
152
151
  homepage: https://github.com/zdavatz/brand2csv
153
152
  licenses: []
154
153
  post_install_message:
data/.gemtest DELETED
File without changes