brand2csv 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +1 -5
- data/History.txt +6 -0
- data/lib/brand2csv.rb +87 -59
- data/lib/brand2csv/version.rb +1 -1
- metadata +2 -3
- data/.gemtest +0 -0
data/Gemfile
CHANGED
@@ -1,9 +1,5 @@
|
|
1
1
|
source 'https://rubygems.org'
|
2
|
-
gem 'mechanize'
|
3
|
-
gem 'mechanize', :path => '/opt/src/mechanize/pkg' if false
|
4
|
-
git 'git://github.com/sparklemotion/mechanize.git' do #, :tag => 'v0.13.4'
|
5
|
-
gem 'mechanize'
|
6
|
-
end if false
|
2
|
+
gem 'mechanize'
|
7
3
|
gem 'json', '~> 1.7.7'
|
8
4
|
gem 'nokogiri'
|
9
5
|
|
data/History.txt
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
=== 0.2.2 05.06.2013
|
2
|
+
|
3
|
+
* Fixed (re-)opening of session
|
4
|
+
* Reorganized for long run. Still errors on reconnect
|
5
|
+
* Added real fetch from swissreg to specs
|
6
|
+
|
1
7
|
=== 0.2.1 02.06.2013
|
2
8
|
|
3
9
|
* Fixed problems with ampersands. Emit clear error when swissreg.ch not responding correctly
|
data/lib/brand2csv.rb
CHANGED
@@ -33,6 +33,7 @@ module Brand2csv
|
|
33
33
|
Sr3 = "#{Base_uri}/srclient/faces/jsp/trademark/sr3.jsp"
|
34
34
|
Sr30 = "#{Base_uri}/srclient/faces/jsp/trademark/sr30.jsp"
|
35
35
|
Sr300 = "#{Base_uri}/srclient/faces/jsp/trademark/sr300.jsp"
|
36
|
+
DetailRegexp = /d_swissreg:mainContent:data:(\d*):tm_no_detail:id_detail/i
|
36
37
|
AddressRegexp = /^(\d\d\d\d)\W*(.*)/
|
37
38
|
LineSplit = ', '
|
38
39
|
DefaultCountry = 'Schweiz'
|
@@ -84,7 +85,7 @@ module Brand2csv
|
|
84
85
|
|
85
86
|
MaxZeilen = 5
|
86
87
|
HitsPerPage = 250
|
87
|
-
LogDir = '
|
88
|
+
LogDir = 'log'
|
88
89
|
|
89
90
|
attr_accessor :marke, :results, :timespan
|
90
91
|
|
@@ -92,14 +93,8 @@ module Brand2csv
|
|
92
93
|
@timespan = timespan
|
93
94
|
@marke = marke
|
94
95
|
@number = nil
|
95
|
-
|
96
|
-
@agent = Mechanize.new { |agent|
|
97
|
-
agent.user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0'
|
98
|
-
agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
99
|
-
FileUtils.makedirs(LogDir) if $VERBOSE or defined?(RSpec)
|
100
|
-
agent.log = Logger.new("#{LogDir}/mechanize.log") if $VERBOSE
|
101
|
-
}
|
102
96
|
@results = []
|
97
|
+
@all_trademark_numbers = []
|
103
98
|
@errors = Hash.new
|
104
99
|
@lastDetail =nil
|
105
100
|
@counterDetails = 0
|
@@ -131,11 +126,15 @@ module Brand2csv
|
|
131
126
|
|
132
127
|
UseClick = false
|
133
128
|
|
134
|
-
|
135
|
-
|
136
|
-
nummer =@number) # nummer = "559271" ergibt genau einen treffer
|
137
|
-
|
129
|
+
# Initialize a session with swissreg and save the cookie as @state
|
130
|
+
def init_swissreg
|
138
131
|
begin
|
132
|
+
@agent = Mechanize.new { |agent|
|
133
|
+
agent.user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0'
|
134
|
+
agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
135
|
+
FileUtils.makedirs(LogDir) if $VERBOSE or defined?(RSpec)
|
136
|
+
agent.log = Logger.new("#{LogDir}/mechanize.log") if $VERBOSE
|
137
|
+
}
|
139
138
|
@agent.get_file Start_uri # 'https://www.swissreg.ch/srclient/faces/jsp/start.jsp'
|
140
139
|
writeResponse("#{LogDir}/session_expired.html")
|
141
140
|
checkErrors(@agent.page.body, false)
|
@@ -146,6 +145,13 @@ module Brand2csv
|
|
146
145
|
puts "Net::HTTPInternalServerError oder Mechanize::ResponseCodeError gesehen.\n #{Base_uri} hat wahrscheinlich Probleme"
|
147
146
|
exit 3
|
148
147
|
end
|
148
|
+
end
|
149
|
+
|
150
|
+
def parse_swissreg(timespan = @timespan, # sollte 377 Treffer ergeben, für 01.06.2007-10.06.2007, 559271 wurde in diesem Zeitraum registriert
|
151
|
+
marke = @marke,
|
152
|
+
nummer =@number) # nummer = "559271" ergibt genau einen treffer
|
153
|
+
|
154
|
+
init_swissreg
|
149
155
|
data = [
|
150
156
|
["autoScroll", "0,0"],
|
151
157
|
["id_swissreg:_link_hidden_", ""],
|
@@ -303,7 +309,6 @@ module Brand2csv
|
|
303
309
|
bezeichnung = nil
|
304
310
|
inhaber = nil
|
305
311
|
hinterlegungsdatum = nil
|
306
|
-
zeilen = []
|
307
312
|
doc.xpath("//html/body/form/div/div/fieldset/div/table/tbody/tr").each{
|
308
313
|
|x|
|
309
314
|
if x.children.first.text.eql?('Marke')
|
@@ -314,9 +319,8 @@ module Brand2csv
|
|
314
319
|
bezeichnung = x.children[1].text
|
315
320
|
end
|
316
321
|
end
|
322
|
+
|
317
323
|
if x.children.first.text.eql?('Inhaber/in')
|
318
|
-
# inhaber = />(.*)<\/td/.match(x.children[1].to_s)[1].gsub('<br>',LineSplit).gsub('&amp;', '&')
|
319
|
-
# x.children[1].children.each{ |child| zeilen << child.text.gsub('&amp;', '&') unless child.text.length == 0 } # avoid adding <br>
|
320
324
|
inhaber = />(.*)<\/td/.match(x.children[1].to_s)[1].gsub('<br>',LineSplit)
|
321
325
|
end
|
322
326
|
hinterlegungsdatum = x.children[1].text if x.children.first.text.eql?('Hinterlegungsdatum')
|
@@ -326,6 +330,26 @@ module Brand2csv
|
|
326
330
|
marke = Marke.new(bezeichnung, number, inhaber, DefaultCountry, hinterlegungsdatum, zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort )
|
327
331
|
end
|
328
332
|
|
333
|
+
def fetchDetails(nummer) # takes a long time!
|
334
|
+
@counterDetails += 1
|
335
|
+
init_swissreg if @counterDetails % 90 == 0 # it seems that swissreg is artificially slowing down serving request after 100 hits
|
336
|
+
filename = "#{LogDir}/detail_#{sprintf('%05d', @counterDetails)}_#{nummer.gsub('/','.')}.html"
|
337
|
+
if File.exists?(filename)
|
338
|
+
doc = Nokogiri::Slop(File.open(filename))
|
339
|
+
else
|
340
|
+
url = "#{Sr300}?language=de&section=tm&id=#{nummer}"
|
341
|
+
pp "#{Time.now.strftime("%H:%M:%S")}: Opening #{filename}" if $VERBOSE
|
342
|
+
$stdout.flush
|
343
|
+
content = @agent.get_file url
|
344
|
+
body = @agent.page.body
|
345
|
+
body.force_encoding('utf-8') unless /^1\.8/.match(RUBY_VERSION)
|
346
|
+
doc = Nokogiri::Slop(body)
|
347
|
+
writeResponse(filename)
|
348
|
+
end
|
349
|
+
marke = Swissreg::getMarkenInfoFromDetail(doc)
|
350
|
+
@results << marke
|
351
|
+
end
|
352
|
+
|
329
353
|
def Swissreg::emitCsv(results, filename='ausgabe.csv')
|
330
354
|
return if results == nil or results.size == 0
|
331
355
|
if /^1\.8/.match(RUBY_VERSION)
|
@@ -360,6 +384,17 @@ module Brand2csv
|
|
360
384
|
end
|
361
385
|
end
|
362
386
|
|
387
|
+
def Swissreg::getTrademarkNumbers(doc)
|
388
|
+
trademark_numbers = []
|
389
|
+
doc.search('a').each{
|
390
|
+
|link|
|
391
|
+
if DetailRegexp.match(link.attribute('id'))
|
392
|
+
trademark_numbers << link.children.first.children.first.content
|
393
|
+
end
|
394
|
+
}
|
395
|
+
trademark_numbers
|
396
|
+
end
|
397
|
+
|
363
398
|
class Swissreg::Vereinfachte
|
364
399
|
attr_reader :links2details, :trademark_search_id, :inputData, :firstHit, :nrHits, :nrSubPages, :pageNr
|
365
400
|
HitRegexpDE = /Seite (\d*) von ([\d']*) - Treffer ([\d']*)-([\d']*) von ([\d']*)/
|
@@ -384,7 +419,7 @@ module Brand2csv
|
|
384
419
|
@state = Swissreg::inputValue(Swissreg::getInputValuesFromPage(doc), 'javax.faces.ViewState')
|
385
420
|
doc.search('a').each{
|
386
421
|
|link|
|
387
|
-
if m =
|
422
|
+
if m = DetailRegexp.match(link.attribute('id'))
|
388
423
|
# puts "XXX #{link.attribute('onclick').to_s} href: #{link.attribute('href').to_s} value #{link.attribute('value').to_s}" if $VERBOSE
|
389
424
|
m = /'tmMainId','(\d*)'/.match(link.attribute('onclick').to_s)
|
390
425
|
tmMainId = m[1].to_i
|
@@ -422,7 +457,7 @@ module Brand2csv
|
|
422
457
|
end
|
423
458
|
|
424
459
|
end
|
425
|
-
|
460
|
+
|
426
461
|
def getAllHits(filename = nil, pageNr = 1)
|
427
462
|
if filename && File.exists?(filename)
|
428
463
|
doc = Nokogiri::Slop(File.open(filename))
|
@@ -435,7 +470,7 @@ module Brand2csv
|
|
435
470
|
end
|
436
471
|
|
437
472
|
einfach = Swissreg::Vereinfachte.new(doc)
|
438
|
-
puts "#{Time.now.strftime("%H:%M:%S")} status:
|
473
|
+
puts "#{Time.now.strftime("%H:%M:%S")} status: getAllHits for #{pageNr} of #{einfach.nrSubPages} pages" if $VERBOSE
|
439
474
|
subPage2Fetch = pageNr + 1
|
440
475
|
data2 = einfach.getPostDataForSubpage(subPage2Fetch).clone
|
441
476
|
if (HitsPerPage < einfach.nrHits - einfach.firstHit)
|
@@ -443,46 +478,16 @@ module Brand2csv
|
|
443
478
|
else
|
444
479
|
itemsToFetch = einfach.nrHits - einfach.firstHit
|
445
480
|
end
|
446
|
-
|
447
|
-
|
448
|
-
id = einfach.links2details[position]
|
449
|
-
nextId = einfach.firstHit.to_i - 1 + position.to_i
|
450
|
-
data3 = einfach.getPostDataForDetail(nextId, id)
|
451
|
-
Swissreg::setAllInputValue(@agent.page.forms.first, data3)
|
452
|
-
nrTries = 1
|
453
|
-
while true
|
454
|
-
begin
|
455
|
-
@agent.page.forms.first.submit
|
456
|
-
break
|
457
|
-
rescue
|
458
|
-
puts "Rescue in submit. nrTries is #{nrTries}. Retry after a few seconds"
|
459
|
-
nrTries += 1
|
460
|
-
sleep 10
|
461
|
-
exit 1 if nrTries > 3
|
462
|
-
end
|
463
|
-
end
|
464
|
-
filename = "#{LogDir}/vereinfachte_detail_#{einfach.firstHit + position}.html"
|
465
|
-
writeResponse(filename)
|
466
|
-
matchResult = @agent.page.search('h1').text
|
467
|
-
unless /Detailansicht zu (Gesuch|Marke)/.match(matchResult)
|
468
|
-
puts matchResult
|
469
|
-
puts "Attention did not find 'Detailansicht' in #{filename}. Someting went wrong!"
|
470
|
-
break
|
471
|
-
end
|
472
|
-
@results << Swissreg::getMarkenInfoFromDetail(Nokogiri::Slop(@agent.page.body))
|
473
|
-
@agent.back
|
474
|
-
sleep 1
|
475
|
-
}
|
481
|
+
@all_trademark_numbers += Swissreg::getTrademarkNumbers(doc)
|
482
|
+
|
476
483
|
filename = "#{LogDir}/vereinfachte_#{pageNr}_back.html"
|
477
484
|
writeResponse(filename)
|
478
|
-
if pageNr < (einfach.nrSubPages
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
end
|
485
|
-
|
485
|
+
if pageNr < (einfach.nrSubPages)
|
486
|
+
Swissreg::setAllInputValue(@agent.page.forms.first, data2)
|
487
|
+
@agent.page.forms.first.submit
|
488
|
+
getAllHits(nil, subPage2Fetch)
|
489
|
+
end
|
490
|
+
@all_trademark_numbers
|
486
491
|
end
|
487
492
|
|
488
493
|
def fetchresult(filename = "#{LogDir}/fetch_1.html", counter = 1)
|
@@ -503,11 +508,33 @@ module Brand2csv
|
|
503
508
|
filename = "#{LogDir}/vereinfacht.html"
|
504
509
|
writeResponse(filename)
|
505
510
|
end
|
506
|
-
getAllHits(
|
511
|
+
getAllHits(doc, counter)
|
512
|
+
puts"getAllHits: returned #{@all_trademark_numbers ? @all_trademark_numbers.size : 0} hits "
|
513
|
+
if @all_trademark_numbers
|
514
|
+
@all_trademark_numbers.each{
|
515
|
+
|nr|
|
516
|
+
nrRetries = 0
|
517
|
+
begin
|
518
|
+
fetchDetails(nr)
|
519
|
+
rescue Exception => e
|
520
|
+
nrRetries += 1
|
521
|
+
puts e.backtrace
|
522
|
+
puts "fetchDetails did not work reinit session and retry for #{nr}. nrRetries #{nrRetries}/3. e #{e}"
|
523
|
+
if nrRetries <= 3
|
524
|
+
init_swissreg
|
525
|
+
retry
|
526
|
+
else
|
527
|
+
raise Interrupt
|
528
|
+
end
|
529
|
+
end
|
530
|
+
|
531
|
+
}
|
532
|
+
else
|
533
|
+
puts "Could not find any trademarks in #{filename}"
|
534
|
+
end
|
507
535
|
end
|
508
|
-
|
509
536
|
end # class Swissreg
|
510
|
-
|
537
|
+
|
511
538
|
def Brand2csv::run(timespan, marke = 'a*')
|
512
539
|
session = Swissreg.new(timespan, marke)
|
513
540
|
begin
|
@@ -517,6 +544,7 @@ module Brand2csv
|
|
517
544
|
puts "Unterbrochen. Vesuche #{session.results.size} Resultate zu speichern"
|
518
545
|
end
|
519
546
|
Swissreg::emitCsv(session.results, "#{timespan}.csv")
|
547
|
+
session.results
|
520
548
|
end
|
521
549
|
|
522
550
|
end # module Brand2csv
|
data/lib/brand2csv/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: brand2csv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-06-
|
12
|
+
date: 2013-06-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
@@ -148,7 +148,6 @@ files:
|
|
148
148
|
- spike_mechanize_swissreg.rb
|
149
149
|
- spike_watir.rb
|
150
150
|
- swissreg.rb
|
151
|
-
- .gemtest
|
152
151
|
homepage: https://github.com/zdavatz/brand2csv
|
153
152
|
licenses: []
|
154
153
|
post_install_message:
|
data/.gemtest
DELETED
File without changes
|