brand2csv 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,19 @@
1
+ .gitignore
2
+ .rspec
3
+ Gemfile
4
+ Gemfile.lock
5
+ History.txt
6
+ LICENCE.txt
7
+ LICENSE
8
+ Manifest.txt
9
+ README.md
10
+ Rakefile
11
+ lib/brand2csv.rb
12
+ lib/brand2csv/version.rb
13
+ protocol.2013.05.12.textile
14
+ protocol.2013.05.15.textile
15
+ protocol.2013.05.21.textile
16
+ resultat_1.html
17
+ spike.rb
18
+ spike_mechanize_swissreg.rb
19
+ spike_watir.rb
@@ -0,0 +1,6 @@
1
+ # brand2csv
2
+
3
+ brand2csv
4
+ =========
5
+
6
+ brand2csv using swissreg.ch
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ require 'rubygems'
5
+ require 'hoe'
6
+
7
# Gem specification for brand2csv, declared through Hoe.
Hoe.spec "brand2csv" do
  # Gem metadata (Hoe maps these onto gem.authors, gem.email, gem.homepage).
  self.author = "Niklaus Giger, Zeno R.R. Davatz"
  self.email = "yasaka@ywesee.com, zdavatz@ywesee.com"
  self.summary = "brand2csv creates csv files for swiss brands."
  self.description = "brand2csv creates csv files for swiss brand registered in a specific time period.
The csv contains the brand, link to image (if present), link to the detailinfo at swissreg.ch, name and address of owner (Inhaber)"
  self.urls = ["https://github.com/zdavatz/brand2csv"]

  # Runtime dependency (gem.add_runtime_dependency).
  extra_deps << ['mechanize', '>= 2.6']

  # Development dependencies (gem.add_development_dependency).
  [
    ['rspec'],
    ['webmock'],
    ['hoe', '>= 3.4'],
    ['rdoc'],
  ].each { |dev_dependency| extra_dev_deps << dev_dependency }
end
@@ -0,0 +1,341 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
require 'csv'
require 'fileutils'
require 'optparse'
require 'pp'
require 'prettyprint'
require 'mechanize'
require "brand2csv/version"
9
+
10
+ module Brand2csv
11
+
12
# One trademark ("Marke") record as it is collected from swissreg.ch and
# written to the CSV file. The member order defines the CSV column order.
#
# Assigned directly from Struct.new instead of subclassing it, which avoids
# an extra anonymous class in the ancestor chain.
Marke = Struct.new(:name, :markennummer, :inhaber, :land, :hinterlegungsdatum,
                   :zeile_1, :zeile_2, :zeile_3, :zeile_4, :zeile_5, :plz, :ort)
14
+
15
+ class Swissreg
16
+
17
# Messages in a swissreg.ch response that mean the search failed or was
# rejected. Two entries are cut off right before their first umlaut
# (see the trailing comments for the missing rest).
# All constants below are frozen so they cannot be modified by accident.
BekannteFehler = [
  'Das Datum ist ung', # ültig
  'Erweiterte Suche',
  'Vereinfachte Trefferliste anzeigen',
  'Es wurden keine Daten gefunden.',
  'Die Suchkriterien sind teilweise unzul', # ässig
  'Geben Sie mindestens ein Suchkriterium ein',
  'Die Suche wurde abgebrochen, da die maximale Suchzeit von 60 Sekunden',
].freeze
Base_uri = 'https://www.swissreg.ch'.freeze
Start_uri = "#{Base_uri}/srclient/faces/jsp/start.jsp".freeze
# A line starting with four digits: captures the zip code and the rest (city).
AddressRegexp = /^(\d\d\d\d)\W*(.*)/
# Separator between the address lines inside the owner string.
LineSplit = ', '.freeze
DefaultCountry = 'Schweiz'.freeze
# Columns to display, sent as "id_swissreg:mainContent:id_ckbTMChoice".
TMChoiceFields = [
  "tm_lbl_tm_text", # trademark
  # "tm_lbl_state",       # status
  # "tm_lbl_nizza_class", # Nice classification no.
  # "tm_lbl_no",          # number (disabled="disabled" in the form)
  "tm_lbl_applicant", # owner
  "tm_lbl_country", # country (owner)
  # "tm_lbl_agent",       # representative
  # "tm_lbl_licensee",    # licensee
  "tm_lbl_app_date", # filing date
].freeze
44
+
45
+ attr_accessor :marke
46
+
47
# Prepares a search session.
#
# timespan - value later sent as the filing-date search criterion.
#
# Search patterns that were tried while developing (kept for reference):
#   'zzzyyzzzzyzzyz*' => error message "Es wurden keine Daten gefunden"
#   nil               => error message "Geben Sie mindestens ein Suchkriterium ein"
#   'asp*'            => 138 records are fetched
#   'a*'              => 25,490 hits, of which 10000 randomly chosen ones are
#                        shown; from 501 hits on a simplified hit list is used
#   "*WEIH*", and trademark number '500000'
def initialize(timespan)
  @timespan = timespan
  @agent = Mechanize.new do |browser|
    # browser.user_agent_alias = 'Mac Safari'
    browser.user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0'
    # browser.redirection_limit = 5
    browser.verify_mode = OpenSSL::SSL::VERIFY_NONE
  end
  @results = []
  @errors = Hash.new
  @lastResponse = nil
  @lastDetail = nil
  @counterDetails = 0
  @marke = 'asp*'
  @number = nil
  @hitsPerPage = 100
end
73
+
74
# Dumps a response body to +filename+, but only while RSpec is loaded;
# otherwise the write is skipped (with a notice when $VERBOSE is set).
#
# filename - path of the file to (over)write
# body     - content to write; written with puts
#
# The block form of File.open closes the handle even if writing raises.
def writeResponse(filename, body)
  if defined?(RSpec)
    File.open(filename, 'w+') { |ausgabe| ausgabe.puts body }
  else
    puts "Skipping writing #{filename}" if $VERBOSE
  end
end
83
+
84
# Extracts the value of the javax.faces.ViewState field from an HTML page.
#
# response - HTML text (nil is treated as an empty page)
#
# Returns the captured value, or "" when the page has no such field.
#
# Works on a copy: force_encoding changes the receiver in place, which would
# alter the caller's string and raise on a frozen one.
def view_state(response)
  html = response.to_s.dup.force_encoding('utf-8')
  match = /javax\.faces\.ViewState.*?value="([^"]+)"/u.match(html)
  match ? match[1] : ""
end
91
+
92
# Scans a response body for the messages listed in BekannteFehler.
# On the first message found it prints an apology and terminates the
# process with exit status 2.
def checkErrors(body)
  BekannteFehler.each do |known_message|
    next unless body.to_s.index(known_message)

    puts "Tut mir leid. Suche wurde mit Fehlermeldung <#{known_message}> abgebrochen."
    exit 2
  end
end
101
+
102
# Walks through the swissreg.ch forms (start page, simple search, extended
# search) and submits the extended trademark search.
#
# timespan - filing-date criterion; 01.06.2007-10.06.2007 should give 377
#            hits (559271 was registered in that period)
# marke    - wording of the trademark to search for
# nummer   - trademark number; "559271" gives exactly one hit
#
# Stores the ViewState of the start page in @state, the submitted form data
# in @criteria and the result page in @lastResponse, which is also returned.
# checkErrors terminates the process if the result page shows a known error.
def parse_swissreg(timespan = @timespan, marke = @marke, nummer = @number)
  @agent.get Start_uri # get a cookie for the session
  content = @agent.get_file Start_uri
  FileUtils.makedirs 'mechanize'
  writeResponse('mechanize/main.html', content)
  @state = view_state(content)
  data = [
    ["autoScroll", "0,0"],
    ["id_swissreg:_link_hidden_", ""],
    ["id_swissreg_SUBMIT", "1"],
    ["id_swissreg:_idcl", "id_swissreg_sub_nav_ipiNavigation_item0"],
    ["javax.faces.ViewState", @state],
  ]

  content = @agent.post(Start_uri, data)
  writeResponse('mechanize/einfache_suche.html', content.body)

  data = [
    ["autoScroll", "0,0"],
    ["id_swissreg:_link_hidden_", ""],
    ["id_swissreg_SUBMIT", "1"],
    ["id_swissreg:_idcl", "id_swissreg_sub_nav_ipiNavigation_item0_item3"],
    ["javax.faces.ViewState", @state],
  ]
  # sr1 is the simple search, sr3 the extended search
  @path = "/srclient/faces/jsp/trademark/sr3.jsp"
  response = @agent.post(Base_uri + @path, data)
  writeResponse('mechanize/erweiterte_suche.html', response.body)

  @criteria = [
    ["autoScroll", "0,829"],
    ["id_swissreg:_link_hidden_", ""],
    ["id_swissreg:mainContent:id_ckbTMState", "1"], # 1 = pending applications
    # ["id_swissreg:mainContent:id_ckbTMState", "2"], # 2 = deleted applications
    ["id_swissreg:mainContent:id_ckbTMState", "3"], # 3 = active trademarks
    # ["id_swissreg:mainContent:id_ckbTMState", "4"], # 4 = deleted trademarks
    ["id_swissreg:mainContent:id_cbxCountry", "_ALL"], # country selection
    ["id_swissreg:mainContent:id_txf_tm_no", nummer], # trademark number
    ["id_swissreg:mainContent:id_txf_app_no", ""], # application number
    ["id_swissreg:mainContent:id_txf_tm_text", marke], # wording of the trademark
    ["id_swissreg:mainContent:id_txf_applicant", ""], # owner
    ["id_swissreg:mainContent:id_txf_agent", ""], # representative
    ["id_swissreg:mainContent:id_txf_licensee", ""], # licensee
    ["id_swissreg:mainContent:id_txf_nizza_class", ""], # Nice classification no.
    ["id_swissreg:mainContent:id_txf_appDate", timespan], # filing date
    ["id_swissreg:mainContent:id_txf_expiryDate", ""], # end of protection period
    # kind of trademark: 1 = individual, 2 = collective, 3 = guarantee mark
    ["id_swissreg:mainContent:id_cbxTMTypeGrp", "_ALL"], # kind of trademark
    ["id_swissreg:mainContent:id_cbxTMForm", "_ALL"], # trademark type
    ["id_swissreg:mainContent:id_cbxTMColorClaim", "_ALL"], # colour claim
    ["id_swissreg:mainContent:id_txf_pub_date", ""], # publication date

    # publication reasons (id_swissreg:mainContent:id_ckbTMPubReason)
    ["id_swissreg:mainContent:id_ckbTMPubReason", "1"], # new registrations
    ["id_swissreg:mainContent:id_ckbTMPubReason", "2"], # corrections
    ["id_swissreg:mainContent:id_ckbTMPubReason", "3"], # renewals
    ["id_swissreg:mainContent:id_ckbTMPubReason", "4"], # deletions
    ["id_swissreg:mainContent:id_ckbTMPubReason", "5"], # changes of owner
    ["id_swissreg:mainContent:id_ckbTMPubReason", "6"], # changes of representative
    ["id_swissreg:mainContent:id_ckbTMPubReason", "7"], # changes of licence
    ["id_swissreg:mainContent:id_ckbTMPubReason", "8"], # other register changes
    # ["id_swissreg:mainContent:id_ckbTMEmptyHits", "0"], # show empty hit list

    # format choice: 2 = publication view, 1 = register view
    ["id_swissreg:mainContent:id_cbxFormatChoice", "1"],
    ["id_swissreg:mainContent:id_cbxHitsPerPage", @hitsPerPage], # hits per page
  ]
  TMChoiceFields.each do |field2display|
    @criteria << ["id_swissreg:mainContent:id_ckbTMChoice", field2display]
  end
  puts "Marke ist #{marke}" if marke
  puts "Hinterlegungsdatum ist #{timespan}" if $VERBOSE && timespan
  # Fixed: this line used to print the timespan instead of the number.
  puts "nummer ist #{nummer}" if nummer
  @criteria << ["id_swissreg:mainContent:sub_fieldset:id_submit", "suchen"]
  @criteria << ["id_swissreg_SUBMIT", "1"]
  @criteria << ["id_swissreg:_idcl", ""]
  @criteria << ["id_swissreg:_link_hidden_", ""]
  @criteria << ["javax.faces.ViewState", @state]

  @path = "/srclient/faces/jsp/trademark/sr3.jsp"
  response = @agent.post(Base_uri + @path, @criteria)
  writeResponse('mechanize/resultate_1.html', response.body)
  checkErrors(response.body)
  @lastResponse = response
end
190
+
191
# Splits an owner string into up to five address lines and pulls the zip
# code and city out of the first of lines 2..5 that matches AddressRegexp.
#
# nummer  - trademark number, only used in the warning message
# inhaber - owner text whose lines are separated by LineSplit; nil is
#           treated like an unparsable owner
#
# Returns [zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort]. The line
# that held zip/city is returned as nil. Lines beyond the fifth are dropped.
# When no line matches, seven nils are returned (and a warning is printed if
# $VERBOSE is set). Line 1 is never inspected for a zip code.
def parseAddress(nummer, inhaber)
  zeilen = inhaber.to_s.split(LineSplit).first(5)
  zeilen << nil while zeilen.size < 5
  (1..4).each do |idx|
    m = zeilen[idx] && AddressRegexp.match(zeilen[idx])
    next unless m

    zeilen[idx] = nil
    return zeilen + [m[1], m[2]]
  end
  puts "Achtung! Konnte Marke #{nummer} mit Inhaber #{inhaber} nicht parsen" if $VERBOSE
  Array.new(7)
end
213
+
214
# Loads the detail page of one trademark and reads the owner address from
# it. Takes a long time!
#
# nummer - trademark number; used in the detail URL, in the cache file name
#          and as key into @errors
#
# The page is read from mechanize/detail_<nummer>.html if that file exists,
# otherwise fetched from swissreg.ch. For every table cell whose text starts
# with "inhaber" the following cell is taken as the owner: its non-empty
# child texts become the address lines. If @errors holds a record for the
# number, that record is completed and appended to @results; otherwise a
# new Marke is built and appended.
#
# Fixes compared to the previous version:
# * zeile_5 is stored on the record (it was assigned to a stray local)
# * File.exist? replaces File.exists?, which Ruby 3.2 removed
# * the cached file is closed after parsing
# * a missing cell after the "inhaber" label is skipped instead of raising
def fetchDetails(nummer)
  @counterDetails += 1
  filename = "mechanize/detail_#{nummer}.html"
  if File.exist?(filename)
    doc = File.open(filename) { |cached| Nokogiri::Slop(cached) }
  else
    url = "https://www.swissreg.ch/srclient/faces/jsp/trademark/sr300.jsp?language=de&section=tm&id=#{nummer}"
    pp "Opening #{url}" if $VERBOSE
    content = @agent.get_file url
    writeResponse(filename, content)
    doc = Nokogiri::Slop(content)
  end
  puts "Bitte um Geduld. Hole Adressdetails für Marke #{nummer}. (#{@counterDetails} von #{@errors.size})"
  cells = doc.xpath("//html/body/form/div/div/fieldset/div/table/tbody/tr/td")
  cells.each_with_index do |td, idx|
    pp "#{idx}: #{td.text}" if $VERBOSE
    next unless /^inhaber/i.match(td.text)

    value_cell = cells[idx + 1] # the cell right after the label
    next unless value_cell

    # Empty texts come from <br> elements and are left out. Separators inside
    # a line are replaced so that joining with LineSplit stays unambiguous.
    zeilen = value_cell.children.map(&:text).reject(&:empty?).map { |text| text.gsub(LineSplit, '. ') }
    if (info = @errors[nummer])
      info.inhaber = zeilen.join(LineSplit)
      info.zeile_1, info.zeile_2, info.zeile_3, info.zeile_4, info.zeile_5, info.plz, info.ort =
        parseAddress(nummer, info.inhaber)
      @results << info
    else
      # NOTE(review): cells 15 and 7 are stored as nodes, not as text, exactly
      # as before -- confirm whether .text was intended here.
      bezeichnung = cells[15]
      inhaber = zeilen.join(LineSplit)
      zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort = parseAddress(nummer, inhaber)
      hinterlegungsdatum = cells[7]
      @results << Marke.new(bezeichnung, nummer, inhaber, DefaultCountry, hinterlegungsdatum,
                            zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort)
    end
  end
end
250
+
251
# Reads one page of the hit list and, while the page links to a following
# page, requests that page and processes it recursively.
#
# filename - optional saved result page to parse instead of @lastResponse
# counter  - number of the page being read; incremented first, so the
#            default 1 looks for the link "scroll_1idx2"
#
# Only rows whose country cell matches DefaultCountry are used. Rows whose
# owner address parseAddress can split go to @results; the others are kept
# in @errors (keyed by trademark number) so that their address can be
# fetched from the detail page later.
#
# Fixes compared to the previous version:
# * the paging request sent the field ["id_swissreg:_link_hidden_ "] (name
#   with a trailing blank and no value); it is now sent like in all other
#   requests of this class
# * a page read from a file is closed after parsing
# * the always-true "if doc.xpath(...)" guard is gone
def fetchresult(filename = nil, counter = 1)
  doc = if filename
          File.open(filename) { |saved| Nokogiri::Slop(saved) }
        else
          Nokogiri::Slop(@lastResponse.body)
        end
  nrFailures = 0
  counter += 1
  puts "fetchresult. Counter #{counter} already #{@results.size} Datensätze für die Zeitspanne '#{@timespan}'"

  next_page_link = /scroll_1idx#{counter}/
  scroller_cells = doc.xpath("//html/body/form/div/div/fieldset/table/tbody/tr/td/table/tr/td")
  hasNext = scroller_cells.any? { |elem| next_page_link.match(elem.to_s) }

  country_filter = /#{DefaultCountry}/i
  doc.xpath("//html/body/form/div/div/fieldset/table/tbody/tr/td/table/tbody/tr").each do |row|
    cells = row.elements
    bezeichnung = cells[1].text
    land = cells[4].text
    next unless country_filter.match(land)

    nummer = cells[2].text
    inhaber = cells[3].text
    hinterlegungsdatum = cells[5].text
    if bezeichnung.length == 0
      # No text in the trademark cell: use the src attribute found in it.
      bezeichnung = row.children[1].children[0].children[0].children[0].attribute('src').to_s
    end
    zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort = parseAddress(nummer, inhaber)
    marke = Marke.new(bezeichnung, nummer, inhaber, land, hinterlegungsdatum,
                      zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort)
    if zeile_1
      @results << marke
    else
      nrFailures += 1
      @errors[nummer] = marke
    end
  end

  if hasNext
    @path = "/srclient/faces/jsp/trademark/sr30.jsp"
    puts "Calling sub #{counter} with #{@path}" if $VERBOSE
    data = [
      ["autoScroll", "0,0"],
      ["id_swissreg:mainContent:id_sub_options_result:sub_fieldset:id_cbxHitsPerPage", @hitsPerPage],
      # ["id_swissreg:mainContent:vivian", "TRADEMARK REGISTER SEARCH TIMES: QUERY=[20] SELECT=[823] SERVER=[846] DELEGATE=[861] (HITS=[96])"],
      ["id_swissreg_SUBMIT", "1"],
      ["id_swissreg:_idcl", "id_swissreg:mainContent:scroll_1idx#{counter}"],
      ["id_swissreg:mainContent:scroll_1", "idx#{counter}"],
      ["tmMainId", ""],
      ["id_swissreg:_link_hidden_", ""],
      ["javax.faces.ViewState", @state],
    ]
    TMChoiceFields.each do |field2display|
      data << ["id_swissreg:mainContent:id_sub_options_result:id_ckbTMChoice", field2display]
    end
    response = @agent.post(Base_uri + @path, data)
    writeResponse("mechanize/resultate_#{counter}.html", response.body)
    checkErrors(response.body)
    @lastResponse = response
    fetchresult(nil, counter)
  else
    # nrFailures only counts the rows of the page handled by this call.
    puts "Es gab #{nrFailures} Fehler beim Lesen von #{filename}" if $VERBOSE
    puts "Fand #{@results.size} Datensätze für die Zeitspanne '#{@timespan}'. Von #{@errors.size} muss die Adresse noch geholt werden."
  end
end
315
+
316
# Writes all collected records to a CSV file with a header row.
#
# filename - path of the CSV file to create (default 'ausgabe.csv')
#
# Does nothing when there are no results. The header row is taken from the
# members of the first record.
#
# The CSV options are passed as keyword arguments: a positional options
# hash raises ArgumentError on Ruby 3. Each record is converted with to_a
# so that CSV receives a plain Array.
def emitCsv(filename='ausgabe.csv')
  return if @results.size == 0

  CSV.open(filename, 'w', headers: @results[0].members, write_headers: true) do |csv|
    @results.each { |marke| csv << marke.to_a }
  end
  puts "Speicherte #{@results.size} gefunden Datensätze für die Zeitspanne '#{@timespan}' in #{filename}"
end
324
+
325
# Calls fetchDetails for every trademark number that is a key of @errors.
def fetchMissingDetails
  @errors.each_key { |markennummer| fetchDetails(markennummer) }
end
331
+ end # class Swissreg
332
+
333
# Entry point: runs one complete search for +timespan+ and writes the CSV.
# Steps: submit the search, read all result pages, fetch the detail pages
# for records without a parsed address, emit the CSV file.
def self.run(timespan)
  swissreg = Swissreg.new(timespan)
  swissreg.parse_swissreg
  swissreg.fetchresult
  swissreg.fetchMissingDetails
  swissreg.emitCsv
end
340
+
341
+ end # module Brand2csv
@@ -0,0 +1,3 @@
1
# Namespace of the brand2csv gem.
module Brand2csv
  # Release number of the gem; frozen so it cannot be modified at runtime.
  VERSION = "0.1.1".freeze
end
@@ -0,0 +1,56 @@
1
+ h3. started brand2csv (12 May 2013)
2
+
3
+ * Added minimal files to create a Ruby gem
4
+ * Started a spike.rb to fetch some elements from swissreg.ch via mechanize
5
+ * To get familiar with mechanize used the google example
6
+ Had to replace @page.form_with(:name => 'f')@ by @page.form_with(:name => 'gbqf')@
7
+
8
+ * www.swissreg.ch must be opened with agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
9
+
10
+ * Examples of a link to details for a brand record are
11
+ bc. https://www.swissreg.ch/srclient/de/tm/61082/2011
12
+ https://www.swissreg.ch/srclient/de/tm/61082/2011
13
+ https://www.swissreg.ch/srclient/faces/jsp/trademark/sr300.jsp?language=en&section=tm&id=61082/2011
14
+
15
+ * Links
16
+ ** Marken Suchen https://www.swissreg.ch/srclient/faces/jsp/start.jsp
17
+ ** Erweiterte Suche https://www.swissreg.ch/srclient/faces/jsp/trademark/sr1.jsp
18
+ ** Resultate der Detailsuche unter https://www.swissreg.ch/srclient/faces/jsp/trademark/sr3.jsp
19
+
20
+ Wasted some time to discover that swissreg.rb does not use mechanize, but URI and hpricot to fetch the patent registration.
21
+
22
+ With watir the following few lines sufficed to fetch a detail
23
+
24
+ bc. Swiss_reg_URL = 'https://www.swissreg.ch'
25
+ client = Selenium::WebDriver::Remote::Http::Default.new
26
+ browser = Watir::Browser.new :firefox
27
+ browser.goto Swiss_reg_URL
28
+ browser.link(:id, "id_swissreg_sub_nav_ipiNavigation_item0").click
29
+ browser.link(:id, "id_swissreg_sub_nav_ipiNavigation_item0_item3").click
30
+ browser.text_field(:id, "id_swissreg:mainContent:id_txf_appDate").set("1.10.2011-5.10.2011")
31
+ browser.button(:value,"suchen").click
32
+ browser.link(:id, "id_swissreg:mainContent:data:2:tm_no_detail:id_detail").click# puts browser.text
33
+
34
+ Was not able to create a spike using either mechanize or uri/hpricot to fetch the details.
35
+
36
+ * Thoughts about the CLI interface to csv
37
+
38
+ bc. brand2csv --help
39
+ Usage: brand2csv 1.10.2011-5.10.2011 [name_of_brand]
40
+ Fetches brand records from swissreg for the given date range into results.csv.
41
+ Each result contains the following fields
42
+ - date of registration
43
+ - brandname
44
+ - owner of brand
45
+ -- name
46
+ -- addressline1
47
+ -- addressline2 (optional)
48
+ -- zip code
49
+ -- city
50
+ Only owners inside Switzerland will be returned.
51
+
52
+ * Would this be a good extension?
53
+ Accumulate all given results (+ temporary result like info_line_1..x) into a sqlite database.
54
+ Would allow an easy sql manipulation of data for filtering/sorting addresses, etc.
55
+
56
+