brand2csv 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +1 -5
- data/History.txt +6 -0
- data/lib/brand2csv.rb +87 -59
- data/lib/brand2csv/version.rb +1 -1
- metadata +2 -3
- data/.gemtest +0 -0
data/Gemfile
CHANGED
@@ -1,9 +1,5 @@
|
|
1
1
|
source 'https://rubygems.org'
|
2
|
-
gem 'mechanize'
|
3
|
-
gem 'mechanize', :path => '/opt/src/mechanize/pkg' if false
|
4
|
-
git 'git://github.com/sparklemotion/mechanize.git' do #, :tag => 'v0.13.4'
|
5
|
-
gem 'mechanize'
|
6
|
-
end if false
|
2
|
+
gem 'mechanize'
|
7
3
|
gem 'json', '~> 1.7.7'
|
8
4
|
gem 'nokogiri'
|
9
5
|
|
data/History.txt
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
=== 0.2.2 05.06.2013
|
2
|
+
|
3
|
+
* Fixed (re-)opening of session
|
4
|
+
* Reorganized for long run. Still errors on reconnect
|
5
|
+
* Added real fetch from swissreg to specs
|
6
|
+
|
1
7
|
=== 0.2.1 02.06.2013
|
2
8
|
|
3
9
|
* Fixed problems with ampersands. Emit clear error when swisssreg.ch not responding correctly
|
data/lib/brand2csv.rb
CHANGED
@@ -33,6 +33,7 @@ module Brand2csv
|
|
33
33
|
Sr3 = "#{Base_uri}/srclient/faces/jsp/trademark/sr3.jsp"
|
34
34
|
Sr30 = "#{Base_uri}/srclient/faces/jsp/trademark/sr30.jsp"
|
35
35
|
Sr300 = "#{Base_uri}/srclient/faces/jsp/trademark/sr300.jsp"
|
36
|
+
DetailRegexp = /d_swissreg:mainContent:data:(\d*):tm_no_detail:id_detail/i
|
36
37
|
AddressRegexp = /^(\d\d\d\d)\W*(.*)/
|
37
38
|
LineSplit = ', '
|
38
39
|
DefaultCountry = 'Schweiz'
|
@@ -84,7 +85,7 @@ module Brand2csv
|
|
84
85
|
|
85
86
|
MaxZeilen = 5
|
86
87
|
HitsPerPage = 250
|
87
|
-
LogDir = '
|
88
|
+
LogDir = 'log'
|
88
89
|
|
89
90
|
attr_accessor :marke, :results, :timespan
|
90
91
|
|
@@ -92,14 +93,8 @@ module Brand2csv
|
|
92
93
|
@timespan = timespan
|
93
94
|
@marke = marke
|
94
95
|
@number = nil
|
95
|
-
|
96
|
-
@agent = Mechanize.new { |agent|
|
97
|
-
agent.user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0'
|
98
|
-
agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
99
|
-
FileUtils.makedirs(LogDir) if $VERBOSE or defined?(RSpec)
|
100
|
-
agent.log = Logger.new("#{LogDir}/mechanize.log") if $VERBOSE
|
101
|
-
}
|
102
96
|
@results = []
|
97
|
+
@all_trademark_numbers = []
|
103
98
|
@errors = Hash.new
|
104
99
|
@lastDetail =nil
|
105
100
|
@counterDetails = 0
|
@@ -131,11 +126,15 @@ module Brand2csv
|
|
131
126
|
|
132
127
|
UseClick = false
|
133
128
|
|
134
|
-
|
135
|
-
|
136
|
-
nummer =@number) # nummer = "559271" ergibt genau einen treffer
|
137
|
-
|
129
|
+
# Initialize a session with swissreg and save the cookie as @state
|
130
|
+
def init_swissreg
|
138
131
|
begin
|
132
|
+
@agent = Mechanize.new { |agent|
|
133
|
+
agent.user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0'
|
134
|
+
agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
135
|
+
FileUtils.makedirs(LogDir) if $VERBOSE or defined?(RSpec)
|
136
|
+
agent.log = Logger.new("#{LogDir}/mechanize.log") if $VERBOSE
|
137
|
+
}
|
139
138
|
@agent.get_file Start_uri # 'https://www.swissreg.ch/srclient/faces/jsp/start.jsp'
|
140
139
|
writeResponse("#{LogDir}/session_expired.html")
|
141
140
|
checkErrors(@agent.page.body, false)
|
@@ -146,6 +145,13 @@ module Brand2csv
|
|
146
145
|
puts "Net::HTTPInternalServerError oder Mechanize::ResponseCodeError gesehen.\n #{Base_uri} hat wahrscheinlich Probleme"
|
147
146
|
exit 3
|
148
147
|
end
|
148
|
+
end
|
149
|
+
|
150
|
+
def parse_swissreg(timespan = @timespan, # sollte 377 Treffer ergeben, für 01.06.2007-10.06.2007, 559271 wurde in diesem Zeitraum registriert
|
151
|
+
marke = @marke,
|
152
|
+
nummer =@number) # nummer = "559271" ergibt genau einen treffer
|
153
|
+
|
154
|
+
init_swissreg
|
149
155
|
data = [
|
150
156
|
["autoScroll", "0,0"],
|
151
157
|
["id_swissreg:_link_hidden_", ""],
|
@@ -303,7 +309,6 @@ module Brand2csv
|
|
303
309
|
bezeichnung = nil
|
304
310
|
inhaber = nil
|
305
311
|
hinterlegungsdatum = nil
|
306
|
-
zeilen = []
|
307
312
|
doc.xpath("//html/body/form/div/div/fieldset/div/table/tbody/tr").each{
|
308
313
|
|x|
|
309
314
|
if x.children.first.text.eql?('Marke')
|
@@ -314,9 +319,8 @@ module Brand2csv
|
|
314
319
|
bezeichnung = x.children[1].text
|
315
320
|
end
|
316
321
|
end
|
322
|
+
|
317
323
|
if x.children.first.text.eql?('Inhaber/in')
|
318
|
-
# inhaber = />(.*)<\/td/.match(x.children[1].to_s)[1].gsub('<br>',LineSplit).gsub('&amp;', '&')
|
319
|
-
# x.children[1].children.each{ |child| zeilen << child.text.gsub('&amp;', '&') unless child.text.length == 0 } # avoid adding <br>
|
320
324
|
inhaber = />(.*)<\/td/.match(x.children[1].to_s)[1].gsub('<br>',LineSplit)
|
321
325
|
end
|
322
326
|
hinterlegungsdatum = x.children[1].text if x.children.first.text.eql?('Hinterlegungsdatum')
|
@@ -326,6 +330,26 @@ module Brand2csv
|
|
326
330
|
marke = Marke.new(bezeichnung, number, inhaber, DefaultCountry, hinterlegungsdatum, zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort )
|
327
331
|
end
|
328
332
|
|
333
|
+
def fetchDetails(nummer) # takes a long time!
|
334
|
+
@counterDetails += 1
|
335
|
+
init_swissreg if @counterDetails % 90 == 0 # it seems that swissreg is artificially slowing down serving request after 100 hits
|
336
|
+
filename = "#{LogDir}/detail_#{sprintf('%05d', @counterDetails)}_#{nummer.gsub('/','.')}.html"
|
337
|
+
if File.exists?(filename)
|
338
|
+
doc = Nokogiri::Slop(File.open(filename))
|
339
|
+
else
|
340
|
+
url = "#{Sr300}?language=de&section=tm&id=#{nummer}"
|
341
|
+
pp "#{Time.now.strftime("%H:%M:%S")}: Opening #{filename}" if $VERBOSE
|
342
|
+
$stdout.flush
|
343
|
+
content = @agent.get_file url
|
344
|
+
body = @agent.page.body
|
345
|
+
body.force_encoding('utf-8') unless /^1\.8/.match(RUBY_VERSION)
|
346
|
+
doc = Nokogiri::Slop(body)
|
347
|
+
writeResponse(filename)
|
348
|
+
end
|
349
|
+
marke = Swissreg::getMarkenInfoFromDetail(doc)
|
350
|
+
@results << marke
|
351
|
+
end
|
352
|
+
|
329
353
|
def Swissreg::emitCsv(results, filename='ausgabe.csv')
|
330
354
|
return if results == nil or results.size == 0
|
331
355
|
if /^1\.8/.match(RUBY_VERSION)
|
@@ -360,6 +384,17 @@ module Brand2csv
|
|
360
384
|
end
|
361
385
|
end
|
362
386
|
|
387
|
+
def Swissreg::getTrademarkNumbers(doc)
|
388
|
+
trademark_numbers = []
|
389
|
+
doc.search('a').each{
|
390
|
+
|link|
|
391
|
+
if DetailRegexp.match(link.attribute('id'))
|
392
|
+
trademark_numbers << link.children.first.children.first.content
|
393
|
+
end
|
394
|
+
}
|
395
|
+
trademark_numbers
|
396
|
+
end
|
397
|
+
|
363
398
|
class Swissreg::Vereinfachte
|
364
399
|
attr_reader :links2details, :trademark_search_id, :inputData, :firstHit, :nrHits, :nrSubPages, :pageNr
|
365
400
|
HitRegexpDE = /Seite (\d*) von ([\d']*) - Treffer ([\d']*)-([\d']*) von ([\d']*)/
|
@@ -384,7 +419,7 @@ module Brand2csv
|
|
384
419
|
@state = Swissreg::inputValue(Swissreg::getInputValuesFromPage(doc), 'javax.faces.ViewState')
|
385
420
|
doc.search('a').each{
|
386
421
|
|link|
|
387
|
-
if m =
|
422
|
+
if m = DetailRegexp.match(link.attribute('id'))
|
388
423
|
# puts "XXX #{link.attribute('onclick').to_s} href: #{link.attribute('href').to_s} value #{link.attribute('value').to_s}" if $VERBOSE
|
389
424
|
m = /'tmMainId','(\d*)'/.match(link.attribute('onclick').to_s)
|
390
425
|
tmMainId = m[1].to_i
|
@@ -422,7 +457,7 @@ module Brand2csv
|
|
422
457
|
end
|
423
458
|
|
424
459
|
end
|
425
|
-
|
460
|
+
|
426
461
|
def getAllHits(filename = nil, pageNr = 1)
|
427
462
|
if filename && File.exists?(filename)
|
428
463
|
doc = Nokogiri::Slop(File.open(filename))
|
@@ -435,7 +470,7 @@ module Brand2csv
|
|
435
470
|
end
|
436
471
|
|
437
472
|
einfach = Swissreg::Vereinfachte.new(doc)
|
438
|
-
puts "#{Time.now.strftime("%H:%M:%S")} status:
|
473
|
+
puts "#{Time.now.strftime("%H:%M:%S")} status: getAllHits for #{pageNr} of #{einfach.nrSubPages} pages" if $VERBOSE
|
439
474
|
subPage2Fetch = pageNr + 1
|
440
475
|
data2 = einfach.getPostDataForSubpage(subPage2Fetch).clone
|
441
476
|
if (HitsPerPage < einfach.nrHits - einfach.firstHit)
|
@@ -443,46 +478,16 @@ module Brand2csv
|
|
443
478
|
else
|
444
479
|
itemsToFetch = einfach.nrHits - einfach.firstHit
|
445
480
|
end
|
446
|
-
|
447
|
-
|
448
|
-
id = einfach.links2details[position]
|
449
|
-
nextId = einfach.firstHit.to_i - 1 + position.to_i
|
450
|
-
data3 = einfach.getPostDataForDetail(nextId, id)
|
451
|
-
Swissreg::setAllInputValue(@agent.page.forms.first, data3)
|
452
|
-
nrTries = 1
|
453
|
-
while true
|
454
|
-
begin
|
455
|
-
@agent.page.forms.first.submit
|
456
|
-
break
|
457
|
-
rescue
|
458
|
-
puts "Rescue in submit. nrTries is #{nrTries}. Retry after a few seconds"
|
459
|
-
nrTries += 1
|
460
|
-
sleep 10
|
461
|
-
exit 1 if nrTries > 3
|
462
|
-
end
|
463
|
-
end
|
464
|
-
filename = "#{LogDir}/vereinfachte_detail_#{einfach.firstHit + position}.html"
|
465
|
-
writeResponse(filename)
|
466
|
-
matchResult = @agent.page.search('h1').text
|
467
|
-
unless /Detailansicht zu (Gesuch|Marke)/.match(matchResult)
|
468
|
-
puts matchResult
|
469
|
-
puts "Attention did not find 'Detailansicht' in #{filename}. Someting went wrong!"
|
470
|
-
break
|
471
|
-
end
|
472
|
-
@results << Swissreg::getMarkenInfoFromDetail(Nokogiri::Slop(@agent.page.body))
|
473
|
-
@agent.back
|
474
|
-
sleep 1
|
475
|
-
}
|
481
|
+
@all_trademark_numbers += Swissreg::getTrademarkNumbers(doc)
|
482
|
+
|
476
483
|
filename = "#{LogDir}/vereinfachte_#{pageNr}_back.html"
|
477
484
|
writeResponse(filename)
|
478
|
-
if pageNr < (einfach.nrSubPages
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
end
|
485
|
-
|
485
|
+
if pageNr < (einfach.nrSubPages)
|
486
|
+
Swissreg::setAllInputValue(@agent.page.forms.first, data2)
|
487
|
+
@agent.page.forms.first.submit
|
488
|
+
getAllHits(nil, subPage2Fetch)
|
489
|
+
end
|
490
|
+
@all_trademark_numbers
|
486
491
|
end
|
487
492
|
|
488
493
|
def fetchresult(filename = "#{LogDir}/fetch_1.html", counter = 1)
|
@@ -503,11 +508,33 @@ module Brand2csv
|
|
503
508
|
filename = "#{LogDir}/vereinfacht.html"
|
504
509
|
writeResponse(filename)
|
505
510
|
end
|
506
|
-
getAllHits(
|
511
|
+
getAllHits(doc, counter)
|
512
|
+
puts"getAllHits: returned #{@all_trademark_numbers ? @all_trademark_numbers.size : 0} hits "
|
513
|
+
if @all_trademark_numbers
|
514
|
+
@all_trademark_numbers.each{
|
515
|
+
|nr|
|
516
|
+
nrRetries = 0
|
517
|
+
begin
|
518
|
+
fetchDetails(nr)
|
519
|
+
rescue Exception => e
|
520
|
+
nrRetries += 1
|
521
|
+
puts e.backtrace
|
522
|
+
puts "fetchDetails did not work reinit session and retry for #{nr}. nrRetries #{nrRetries}/3. e #{e}"
|
523
|
+
if nrRetries <= 3
|
524
|
+
init_swissreg
|
525
|
+
retry
|
526
|
+
else
|
527
|
+
raise Interrupt
|
528
|
+
end
|
529
|
+
end
|
530
|
+
|
531
|
+
}
|
532
|
+
else
|
533
|
+
puts "Could not find any trademarks in #{filename}"
|
534
|
+
end
|
507
535
|
end
|
508
|
-
|
509
536
|
end # class Swissreg
|
510
|
-
|
537
|
+
|
511
538
|
def Brand2csv::run(timespan, marke = 'a*')
|
512
539
|
session = Swissreg.new(timespan, marke)
|
513
540
|
begin
|
@@ -517,6 +544,7 @@ module Brand2csv
|
|
517
544
|
puts "Unterbrochen. Vesuche #{session.results.size} Resultate zu speichern"
|
518
545
|
end
|
519
546
|
Swissreg::emitCsv(session.results, "#{timespan}.csv")
|
547
|
+
session.results
|
520
548
|
end
|
521
549
|
|
522
550
|
end # module Brand2csv
|
data/lib/brand2csv/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: brand2csv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-06-
|
12
|
+
date: 2013-06-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
@@ -148,7 +148,6 @@ files:
|
|
148
148
|
- spike_mechanize_swissreg.rb
|
149
149
|
- spike_watir.rb
|
150
150
|
- swissreg.rb
|
151
|
-
- .gemtest
|
152
151
|
homepage: https://github.com/zdavatz/brand2csv
|
153
152
|
licenses: []
|
154
153
|
post_install_message:
|
data/.gemtest
DELETED
File without changes
|