brand2csv 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile CHANGED
@@ -1,9 +1,5 @@
1
1
  source 'https://rubygems.org'
2
- gem 'mechanize' # , '~> 2.5.1'
3
- gem 'mechanize', :path => '/opt/src/mechanize/pkg' if false
4
- git 'git://github.com/sparklemotion/mechanize.git' do #, :tag => 'v0.13.4'
5
- gem 'mechanize'
6
- end if false
2
+ gem 'mechanize'
7
3
  gem 'json', '~> 1.7.7'
8
4
  gem 'nokogiri'
9
5
 
data/History.txt CHANGED
@@ -1,3 +1,9 @@
1
+ === 0.2.2 05.06.2013
2
+
3
+ * Fixed (re-)opening of session
4
+ * Reorganized for long runs. Reconnecting still produces errors
5
+ * Added real fetch from swissreg to specs
6
+
1
7
  === 0.2.1 02.06.2013
2
8
 
3
9
  * Fixed problems with ampersands. Emit clear error when swissreg.ch is not responding correctly
data/lib/brand2csv.rb CHANGED
@@ -33,6 +33,7 @@ module Brand2csv
33
33
  Sr3 = "#{Base_uri}/srclient/faces/jsp/trademark/sr3.jsp"
34
34
  Sr30 = "#{Base_uri}/srclient/faces/jsp/trademark/sr30.jsp"
35
35
  Sr300 = "#{Base_uri}/srclient/faces/jsp/trademark/sr300.jsp"
36
+ DetailRegexp = /d_swissreg:mainContent:data:(\d*):tm_no_detail:id_detail/i
36
37
  AddressRegexp = /^(\d\d\d\d)\W*(.*)/
37
38
  LineSplit = ', '
38
39
  DefaultCountry = 'Schweiz'
@@ -84,7 +85,7 @@ module Brand2csv
84
85
 
85
86
  MaxZeilen = 5
86
87
  HitsPerPage = 250
87
- LogDir = 'mechanize'
88
+ LogDir = 'log'
88
89
 
89
90
  attr_accessor :marke, :results, :timespan
90
91
 
@@ -92,14 +93,8 @@ module Brand2csv
92
93
  @timespan = timespan
93
94
  @marke = marke
94
95
  @number = nil
95
-
96
- @agent = Mechanize.new { |agent|
97
- agent.user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0'
98
- agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
99
- FileUtils.makedirs(LogDir) if $VERBOSE or defined?(RSpec)
100
- agent.log = Logger.new("#{LogDir}/mechanize.log") if $VERBOSE
101
- }
102
96
  @results = []
97
+ @all_trademark_numbers = []
103
98
  @errors = Hash.new
104
99
  @lastDetail =nil
105
100
  @counterDetails = 0
@@ -131,11 +126,15 @@ module Brand2csv
131
126
 
132
127
  UseClick = false
133
128
 
134
- def parse_swissreg(timespan = @timespan, # sollte 377 Treffer ergeben, für 01.06.2007-10.06.2007, 559271 wurde in diesem Zeitraum registriert
135
- marke = @marke,
136
- nummer =@number) # nummer = "559271" ergibt genau einen treffer
137
-
129
+ # Initialize a session with swissreg and save the cookie as @state
130
+ def init_swissreg
138
131
  begin
132
+ @agent = Mechanize.new { |agent|
133
+ agent.user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0'
134
+ agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
135
+ FileUtils.makedirs(LogDir) if $VERBOSE or defined?(RSpec)
136
+ agent.log = Logger.new("#{LogDir}/mechanize.log") if $VERBOSE
137
+ }
139
138
  @agent.get_file Start_uri # 'https://www.swissreg.ch/srclient/faces/jsp/start.jsp'
140
139
  writeResponse("#{LogDir}/session_expired.html")
141
140
  checkErrors(@agent.page.body, false)
@@ -146,6 +145,13 @@ module Brand2csv
146
145
  puts "Net::HTTPInternalServerError oder Mechanize::ResponseCodeError gesehen.\n #{Base_uri} hat wahrscheinlich Probleme"
147
146
  exit 3
148
147
  end
148
+ end
149
+
150
+ def parse_swissreg(timespan = @timespan, # sollte 377 Treffer ergeben, für 01.06.2007-10.06.2007, 559271 wurde in diesem Zeitraum registriert
151
+ marke = @marke,
152
+ nummer =@number) # nummer = "559271" ergibt genau einen treffer
153
+
154
+ init_swissreg
149
155
  data = [
150
156
  ["autoScroll", "0,0"],
151
157
  ["id_swissreg:_link_hidden_", ""],
@@ -303,7 +309,6 @@ module Brand2csv
303
309
  bezeichnung = nil
304
310
  inhaber = nil
305
311
  hinterlegungsdatum = nil
306
- zeilen = []
307
312
  doc.xpath("//html/body/form/div/div/fieldset/div/table/tbody/tr").each{
308
313
  |x|
309
314
  if x.children.first.text.eql?('Marke')
@@ -314,9 +319,8 @@ module Brand2csv
314
319
  bezeichnung = x.children[1].text
315
320
  end
316
321
  end
322
+
317
323
  if x.children.first.text.eql?('Inhaber/in')
318
- # inhaber = />(.*)<\/td/.match(x.children[1].to_s)[1].gsub('<br>',LineSplit).gsub('&amp;', '&')
319
- # x.children[1].children.each{ |child| zeilen << child.text.gsub('&amp;', '&') unless child.text.length == 0 } # avoid adding <br>
320
324
  inhaber = />(.*)<\/td/.match(x.children[1].to_s)[1].gsub('<br>',LineSplit)
321
325
  end
322
326
  hinterlegungsdatum = x.children[1].text if x.children.first.text.eql?('Hinterlegungsdatum')
@@ -326,6 +330,26 @@ module Brand2csv
326
330
  marke = Marke.new(bezeichnung, number, inhaber, DefaultCountry, hinterlegungsdatum, zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort )
327
331
  end
328
332
 
333
+ def fetchDetails(nummer) # takes a long time!
334
+ @counterDetails += 1
335
+ init_swissreg if @counterDetails % 90 == 0 # it seems that swissreg is artificially slowing down serving request after 100 hits
336
+ filename = "#{LogDir}/detail_#{sprintf('%05d', @counterDetails)}_#{nummer.gsub('/','.')}.html"
337
+ if File.exists?(filename)
338
+ doc = Nokogiri::Slop(File.open(filename))
339
+ else
340
+ url = "#{Sr300}?language=de&section=tm&id=#{nummer}"
341
+ pp "#{Time.now.strftime("%H:%M:%S")}: Opening #{filename}" if $VERBOSE
342
+ $stdout.flush
343
+ content = @agent.get_file url
344
+ body = @agent.page.body
345
+ body.force_encoding('utf-8') unless /^1\.8/.match(RUBY_VERSION)
346
+ doc = Nokogiri::Slop(body)
347
+ writeResponse(filename)
348
+ end
349
+ marke = Swissreg::getMarkenInfoFromDetail(doc)
350
+ @results << marke
351
+ end
352
+
329
353
  def Swissreg::emitCsv(results, filename='ausgabe.csv')
330
354
  return if results == nil or results.size == 0
331
355
  if /^1\.8/.match(RUBY_VERSION)
@@ -360,6 +384,17 @@ module Brand2csv
360
384
  end
361
385
  end
362
386
 
387
+ def Swissreg::getTrademarkNumbers(doc)
388
+ trademark_numbers = []
389
+ doc.search('a').each{
390
+ |link|
391
+ if DetailRegexp.match(link.attribute('id'))
392
+ trademark_numbers << link.children.first.children.first.content
393
+ end
394
+ }
395
+ trademark_numbers
396
+ end
397
+
363
398
  class Swissreg::Vereinfachte
364
399
  attr_reader :links2details, :trademark_search_id, :inputData, :firstHit, :nrHits, :nrSubPages, :pageNr
365
400
  HitRegexpDE = /Seite (\d*) von ([\d']*) - Treffer ([\d']*)-([\d']*) von ([\d']*)/
@@ -384,7 +419,7 @@ module Brand2csv
384
419
  @state = Swissreg::inputValue(Swissreg::getInputValuesFromPage(doc), 'javax.faces.ViewState')
385
420
  doc.search('a').each{
386
421
  |link|
387
- if m = /d_swissreg:mainContent:data:(\d*):tm_no_detail:id_detail/i.match(link.attribute('id'))
422
+ if m = DetailRegexp.match(link.attribute('id'))
388
423
  # puts "XXX #{link.attribute('onclick').to_s} href: #{link.attribute('href').to_s} value #{link.attribute('value').to_s}" if $VERBOSE
389
424
  m = /'tmMainId','(\d*)'/.match(link.attribute('onclick').to_s)
390
425
  tmMainId = m[1].to_i
@@ -422,7 +457,7 @@ module Brand2csv
422
457
  end
423
458
 
424
459
  end
425
-
460
+
426
461
  def getAllHits(filename = nil, pageNr = 1)
427
462
  if filename && File.exists?(filename)
428
463
  doc = Nokogiri::Slop(File.open(filename))
@@ -435,7 +470,7 @@ module Brand2csv
435
470
  end
436
471
 
437
472
  einfach = Swissreg::Vereinfachte.new(doc)
438
- puts "#{Time.now.strftime("%H:%M:%S")} status: fetch #{pageNr} of #{einfach.nrSubPages}"
473
+ puts "#{Time.now.strftime("%H:%M:%S")} status: getAllHits for #{pageNr} of #{einfach.nrSubPages} pages" if $VERBOSE
439
474
  subPage2Fetch = pageNr + 1
440
475
  data2 = einfach.getPostDataForSubpage(subPage2Fetch).clone
441
476
  if (HitsPerPage < einfach.nrHits - einfach.firstHit)
@@ -443,46 +478,16 @@ module Brand2csv
443
478
  else
444
479
  itemsToFetch = einfach.nrHits - einfach.firstHit
445
480
  end
446
- 0.upto(itemsToFetch-1) {
447
- |position|
448
- id = einfach.links2details[position]
449
- nextId = einfach.firstHit.to_i - 1 + position.to_i
450
- data3 = einfach.getPostDataForDetail(nextId, id)
451
- Swissreg::setAllInputValue(@agent.page.forms.first, data3)
452
- nrTries = 1
453
- while true
454
- begin
455
- @agent.page.forms.first.submit
456
- break
457
- rescue
458
- puts "Rescue in submit. nrTries is #{nrTries}. Retry after a few seconds"
459
- nrTries += 1
460
- sleep 10
461
- exit 1 if nrTries > 3
462
- end
463
- end
464
- filename = "#{LogDir}/vereinfachte_detail_#{einfach.firstHit + position}.html"
465
- writeResponse(filename)
466
- matchResult = @agent.page.search('h1').text
467
- unless /Detailansicht zu (Gesuch|Marke)/.match(matchResult)
468
- puts matchResult
469
- puts "Attention did not find 'Detailansicht' in #{filename}. Someting went wrong!"
470
- break
471
- end
472
- @results << Swissreg::getMarkenInfoFromDetail(Nokogiri::Slop(@agent.page.body))
473
- @agent.back
474
- sleep 1
475
- }
481
+ @all_trademark_numbers += Swissreg::getTrademarkNumbers(doc)
482
+
476
483
  filename = "#{LogDir}/vereinfachte_#{pageNr}_back.html"
477
484
  writeResponse(filename)
478
- if pageNr < (einfach.nrSubPages-1)
479
- puts "Fetching page #{subPage2Fetch} of #{einfach.nrSubPages}" if $VERBOSE
480
- Swissreg::setAllInputValue(@agent.page.forms.first, data2)
481
- @agent.page.forms.first.submit
482
- getAllHits(nil, subPage2Fetch)
483
- @agent.back
484
- end
485
-
485
+ if pageNr < (einfach.nrSubPages)
486
+ Swissreg::setAllInputValue(@agent.page.forms.first, data2)
487
+ @agent.page.forms.first.submit
488
+ getAllHits(nil, subPage2Fetch)
489
+ end
490
+ @all_trademark_numbers
486
491
  end
487
492
 
488
493
  def fetchresult(filename = "#{LogDir}/fetch_1.html", counter = 1)
@@ -503,11 +508,33 @@ module Brand2csv
503
508
  filename = "#{LogDir}/vereinfacht.html"
504
509
  writeResponse(filename)
505
510
  end
506
- getAllHits(filename, counter)
511
+ getAllHits(doc, counter)
512
+ puts"getAllHits: returned #{@all_trademark_numbers ? @all_trademark_numbers.size : 0} hits "
513
+ if @all_trademark_numbers
514
+ @all_trademark_numbers.each{
515
+ |nr|
516
+ nrRetries = 0
517
+ begin
518
+ fetchDetails(nr)
519
+ rescue Exception => e
520
+ nrRetries += 1
521
+ puts e.backtrace
522
+ puts "fetchDetails did not work reinit session and retry for #{nr}. nrRetries #{nrRetries}/3. e #{e}"
523
+ if nrRetries <= 3
524
+ init_swissreg
525
+ retry
526
+ else
527
+ raise Interrupt
528
+ end
529
+ end
530
+
531
+ }
532
+ else
533
+ puts "Could not find any trademarks in #{filename}"
534
+ end
507
535
  end
508
-
509
536
  end # class Swissreg
510
-
537
+
511
538
  def Brand2csv::run(timespan, marke = 'a*')
512
539
  session = Swissreg.new(timespan, marke)
513
540
  begin
@@ -517,6 +544,7 @@ module Brand2csv
517
544
  puts "Unterbrochen. Vesuche #{session.results.size} Resultate zu speichern"
518
545
  end
519
546
  Swissreg::emitCsv(session.results, "#{timespan}.csv")
547
+ session.results
520
548
  end
521
549
 
522
550
  end # module Brand2csv
@@ -1,3 +1,3 @@
1
1
  module Brand2csv
2
- VERSION = "0.2.1"
2
+ VERSION = "0.2.2"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: brand2csv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-06-02 00:00:00.000000000 Z
12
+ date: 2013-06-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
@@ -148,7 +148,6 @@ files:
148
148
  - spike_mechanize_swissreg.rb
149
149
  - spike_watir.rb
150
150
  - swissreg.rb
151
- - .gemtest
152
151
  homepage: https://github.com/zdavatz/brand2csv
153
152
  licenses: []
154
153
  post_install_message:
data/.gemtest DELETED
File without changes