brand2csv 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ .gitignore
2
+ .rspec
3
+ Gemfile
4
+ Gemfile.lock
5
+ History.txt
6
+ LICENCE.txt
7
+ LICENSE
8
+ Manifest.txt
9
+ README.md
10
+ Rakefile
11
+ lib/brand2csv.rb
12
+ lib/brand2csv/version.rb
13
+ protocol.2013.05.12.textile
14
+ protocol.2013.05.15.textile
15
+ protocol.2013.05.21.textile
16
+ resultat_1.html
17
+ spike.rb
18
+ spike_mechanize_swissreg.rb
19
+ spike_watir.rb
@@ -0,0 +1,6 @@
1
+ # brand2csv
2
+
3
+ brand2csv
4
+ =========
5
+
6
+ brand2csv using swissreg.ch
@@ -0,0 +1,23 @@
1
#!/usr/bin/env ruby
# encoding: utf-8

# Rakefile: declares the brand2csv gem through Hoe.
require 'rubygems'
require 'hoe'

Hoe.spec "brand2csv" do
  # Gem metadata (author corresponds to gem.authors, urls to gem.homepage).
  self.author = "Niklaus Giger, Zeno R.R. Davatz"
  self.email = "yasaka@ywesee.com, zdavatz@ywesee.com"
  self.summary = "brand2csv creates csv files for swiss brands."
  self.description = "brand2csv creates csv files for swiss brand registered in a specific time period.
The csv contains the brand, link to image (if present), link to the detailinfo at swissreg.ch, name and address of owner (Inhaber)"
  self.urls = ["https://github.com/zdavatz/brand2csv"]

  # Runtime dependency (gem.add_runtime_dependency).
  self.extra_deps << ['mechanize', '>= 2.6']

  # Development dependencies (gem.add_development_dependency), in declaration order.
  [
    ['rspec'],
    ['webmock'],
    ['hoe', '>= 3.4'],
    ['rdoc'],
  ].each do |dependency|
    self.extra_dev_deps << dependency
  end
end
@@ -0,0 +1,341 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ require "brand2csv/version"
5
+ require 'mechanize'
6
+ require 'prettyprint'
7
+ require 'optparse'
8
+ require 'csv'
9
+
10
+ module Brand2csv
11
+
12
# One trademark ("Marke") record. The members double as the CSV header row and
# column order when the results are written out:
#   name               - wording of the trademark (or an image URL when there is no text)
#   markennummer       - trademark number
#   inhaber            - raw owner string as delivered by swissreg
#   land               - country of the owner
#   hinterlegungsdatum - filing date
#   zeile_1..zeile_5   - owner address lines (the line holding zip/city is blanked)
#   plz, ort           - zip code and city extracted from the address
#
# Assigned directly instead of subclassing Struct.new, which would create an
# extra anonymous superclass for no benefit.
Marke = Struct.new(:name, :markennummer, :inhaber, :land, :hinterlegungsdatum,
                   :zeile_1, :zeile_2, :zeile_3, :zeile_4, :zeile_5, :plz, :ort)
14
+
15
+ class Swissreg
16
+
17
# Known error messages seen in swissreg responses. They are matched as plain
# substrings of the response body; two literals stop right before an umlaut,
# the remainder of the original message is kept in the trailing comment.
BekannteFehler =
  ['Das Datum ist ung', # ültig'
   'Erweiterte Suche',
   'Vereinfachte Trefferliste anzeigen',
   'Es wurden keine Daten gefunden.',
   'Die Suchkriterien sind teilweise unzul', # ässig',
   'Geben Sie mindestens ein Suchkriterium ein',
   'Die Suche wurde abgebrochen, da die maximale Suchzeit von 60 Sekunden',
  ].freeze
Base_uri = 'https://www.swissreg.ch'
Start_uri = "#{Base_uri}/srclient/faces/jsp/start.jsp"
# Four digits at the start of a line (zip code), then the rest (city).
AddressRegexp = /^(\d\d\d\d)\W*(.*)/
# Separator between the address lines inside an owner string.
LineSplit = ', '
DefaultCountry = 'Schweiz'
# Columns to display, sent as "id_swissreg:mainContent:id_ckbTMChoice".
# Commented-out entries are further columns the form offers but which are not requested.
TMChoiceFields = [
  "tm_lbl_tm_text",       # trademark
  # "tm_lbl_state"],      # status
  # "tm_lbl_nizza_class"], # Nice classification no.
  # "tm_lbl_no"], # disabled="disabled"], # number
  "tm_lbl_applicant",     # owner
  "tm_lbl_country",       # country (of owner)
  # "tm_lbl_agent",       # representative
  # "tm_lbl_licensee"],   # licensee
  "tm_lbl_app_date",      # filing date
].freeze
44
+
45
+ attr_accessor :marke
46
+
47
# Sets up a scraping session for one filing-date range.
#
# timespan - filing-date range that is later sent as the "Hinterlegungsdatum"
#            search criterion (the examples in this file look like
#            "01.06.2007-10.06.2007").
def initialize(timespan)
  @timespan = timespan
  @agent = Mechanize.new do |browser|
    # browser.user_agent_alias = 'Mac Safari'
    browser.user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0'
    # browser.redirection_limit = 5
    browser.verify_mode = OpenSSL::SSL::VERIFY_NONE
  end
  @results = []
  @errors = {}
  @lastResponse = nil
  @lastDetail = nil
  @counterDetails = 0

  # Trademark wording used as search pattern. Values tried during development:
  #   'zzzyyzzzzyzzyz*' => error message: "Es wurden keine Daten gefunden"
  #   nil               => error message: "Geben Sie mindestens ein Suchkriterium ein"
  #   'a*'              => 25,490 hits, of which 10000 randomly chosen ones are shown;
  #                        from 501 hits on a simplified hit list is displayed
  #   'asp*'            => 138 records are fetched
  #   "*WEIH*"
  @marke = 'asp*'
  # Trademark number criterion; '500000' was tried as well, nil leaves it unset.
  @number = nil
  @hitsPerPage = 100
end
73
+
74
# Dumps a response body to disk, but only while running under RSpec.
# Outside of RSpec nothing is written; a note is printed when $VERBOSE is set.
#
# filename - path of the file to (over)write
# body     - content to write (written with puts, so a newline is appended if missing)
def writeResponse(filename, body)
  if defined?(RSpec)
    # Block form closes the handle even when writing raises.
    File.open(filename, 'w') { |ausgabe| ausgabe.puts body }
  else
    puts "Skipping writing #{filename}" if $VERBOSE
  end
end
83
+
84
# Extracts the JSF "javax.faces.ViewState" token from an HTML page.
#
# response - HTML source as a String (nil is treated as an empty page)
#
# Returns the content of the first value="..." attribute that follows
# "javax.faces.ViewState" on the same line, or "" when there is none.
def view_state(response)
  # Work on a copy: re-tagging the encoding in place would modify the caller's
  # string and raise on frozen input.
  html = response.to_s.dup.force_encoding('utf-8')
  match = /javax\.faces\.ViewState.*?value="([^"]+)"/u.match(html)
  match ? match[1] : ""
end
91
+
92
# Aborts the program when the response contains one of the known error messages.
#
# body - response body; converted with to_s before searching
#
# On the first entry of BekannteFehler found in the body an apology is printed
# and the process exits with status 2. Otherwise the method returns normally.
def checkErrors(body)
  haystack = body.to_s
  BekannteFehler.each do |known_error|
    next unless haystack.include?(known_error)

    puts "Tut mir leid. Suche wurde mit Fehlermeldung <#{known_error}> abgebrochen."
    exit 2
  end
end
101
+
102
# Walks through the swissreg forms and submits the advanced trademark search.
#
# Steps: fetch the start page (session cookie + ViewState token), post the
# navigation to the trademark section, post the navigation to the advanced
# search (sr3.jsp), then post the search criteria. The ViewState token taken
# from the start page is reused for every request. Each response is handed to
# writeResponse, and the final one is checked with checkErrors.
#
# timespan - filing-date range (Hinterlegungsdatum)
# marke    - wording of the trademark to search for
# nummer   - trademark number to search for
#
# Returns the Mechanize response of the search, also stored in @lastResponse.
def parse_swissreg(timespan = @timespan, # should yield 377 hits for 01.06.2007-10.06.2007; 559271 was registered in that period
                   marke = @marke,
                   nummer = @number)     # nummer = "559271" yields exactly one hit
  @agent.get Start_uri # get a cookie for the session
  content = @agent.get_file Start_uri
  FileUtils.makedirs 'mechanize'
  writeResponse('mechanize/main.html', content)
  @state = view_state(content)
  data = [
    ["autoScroll", "0,0"],
    ["id_swissreg:_link_hidden_", ""],
    ["id_swissreg_SUBMIT", "1"],
    ["id_swissreg:_idcl", "id_swissreg_sub_nav_ipiNavigation_item0"],
    ["javax.faces.ViewState", @state],
  ]

  content = @agent.post(Start_uri, data)
  writeResponse('mechanize/einfache_suche.html', content.body)

  data = [
    ["autoScroll", "0,0"],
    ["id_swissreg:_link_hidden_", ""],
    ["id_swissreg_SUBMIT", "1"],
    ["id_swissreg:_idcl", "id_swissreg_sub_nav_ipiNavigation_item0_item3"],
    ["javax.faces.ViewState", @state],
  ]
  # sr1 is the simple search, sr3 the advanced search
  @path = "/srclient/faces/jsp/trademark/sr3.jsp"
  response = @agent.post(Base_uri + @path, data)
  writeResponse('mechanize/erweiterte_suche.html', response.body)
  # Everything is okay up to here
  @criteria = [
    ["autoScroll", "0,829"],
    ["id_swissreg:_link_hidden_", ""],
    ["id_swissreg:mainContent:id_ckbTMState", "1"], # pending applications 1
    # ["id_swissreg:mainContent:id_ckbTMState", "2"], # deleted applications 2
    ["id_swissreg:mainContent:id_ckbTMState", "3"], # active trademarks 3
    # ["id_swissreg:mainContent:id_ckbTMState", "4"], # deleted trademarks 4
    ["id_swissreg:mainContent:id_cbxCountry", "_ALL"], # country selection _ALL
    # ["id_swissreg:mainContent:id_txf_tm_no", ""], # trademark no.
    ["id_swissreg:mainContent:id_txf_tm_no", nummer], # trademark no.
    ["id_swissreg:mainContent:id_txf_app_no", ""], # application no.
    ["id_swissreg:mainContent:id_txf_tm_text", marke], # wording of the trademark
    ["id_swissreg:mainContent:id_txf_applicant", ""], # owner
    ["id_swissreg:mainContent:id_txf_agent", ""], # representative
    ["id_swissreg:mainContent:id_txf_licensee", ""], # licensee
    ["id_swissreg:mainContent:id_txf_nizza_class", ""], # Nice classification no.
    ["id_swissreg:mainContent:id_txf_appDate", timespan], # filing date
    ["id_swissreg:mainContent:id_txf_expiryDate", ""], # expiry of protection
    # Trademark kind: individual 1, collective 2, guarantee 3
    ["id_swissreg:mainContent:id_cbxTMTypeGrp", "_ALL"], # trademark kind
    ["id_swissreg:mainContent:id_cbxTMForm", "_ALL"], # trademark type
    ["id_swissreg:mainContent:id_cbxTMColorClaim", "_ALL"], # colour claim
    ["id_swissreg:mainContent:id_txf_pub_date", ""], # publication date

    # publication reasons: id_swissreg:mainContent:id_ckbTMPubReason
    ["id_swissreg:mainContent:id_ckbTMPubReason", "1"], # new registrations
    ["id_swissreg:mainContent:id_ckbTMPubReason", "2"], # corrections
    ["id_swissreg:mainContent:id_ckbTMPubReason", "3"], # renewals
    ["id_swissreg:mainContent:id_ckbTMPubReason", "4"], # deletions
    ["id_swissreg:mainContent:id_ckbTMPubReason", "5"], # owner changes
    ["id_swissreg:mainContent:id_ckbTMPubReason", "6"], # representative changes
    ["id_swissreg:mainContent:id_ckbTMPubReason", "7"], # licence changes
    ["id_swissreg:mainContent:id_ckbTMPubReason", "8"], # other register changes
    # ["id_swissreg:mainContent:id_ckbTMEmptyHits", "0"], # show empty hit list

    # "id_swissreg:mainContent:id_cbxFormatChoice" 2 = publication view, 1 = register view
    ["id_swissreg:mainContent:id_cbxFormatChoice", "1"],
    ["id_swissreg:mainContent:id_cbxHitsPerPage", @hitsPerPage], # hits per page
  ]
  # One id_ckbTMChoice entry per column that should be displayed.
  TMChoiceFields.each { |field2display| @criteria << ["id_swissreg:mainContent:id_ckbTMChoice", field2display] }
  puts "Marke ist #{marke}" if marke
  puts "Hinterlegungsdatum ist #{timespan}" if $VERBOSE && timespan
  # Fixed: this line used to print the timespan although it announces the number.
  puts "nummer ist #{nummer}" if nummer
  @criteria << ["id_swissreg:mainContent:sub_fieldset:id_submit", "suchen"]
  @criteria << ["id_swissreg_SUBMIT", "1"]
  @criteria << ["id_swissreg:_idcl", ""]
  @criteria << ["id_swissreg:_link_hidden_", ""]
  @criteria << ["javax.faces.ViewState", @state]

  # @path still points at sr3.jsp, the advanced search form.
  response = @agent.post(Base_uri + @path, @criteria)
  writeResponse('mechanize/resultate_1.html', response.body)
  checkErrors(response.body)
  @lastResponse = response
end
190
+
191
# Splits an owner string into up to five address lines and pulls zip code and
# city out of the first of lines 2..5 that starts with a four-digit zip code.
#
# nummer  - trademark number, only used in the warning message
# inhaber - owner string whose lines are separated by LineSplit
#
# Returns a 7-element array [zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort].
# The line that supplied plz/ort is replaced by nil. When no line matches
# AddressRegexp all seven elements are nil (a warning is printed if $VERBOSE).
# The failure case used to return eight nils; it now has the same arity as
# the success case.
def parseAddress(nummer, inhaber)
  teile = inhaber.split(LineSplit)
  zeilen = Array.new(5) { |i| teile[i] } # exactly five slots, missing lines are nil

  # Line 1 is never inspected; only lines 2..5 are candidates.
  treffer = nil
  index = (1..4).find { |i| treffer = AddressRegexp.match(zeilen[i]) }
  if index.nil?
    puts "Achtung! Konnte Marke #{nummer} mit Inhaber #{inhaber} nicht parsen" if $VERBOSE
    return Array.new(7)
  end

  zeilen[index] = nil
  zeilen + [treffer[1], treffer[2]]
end
213
+
214
# Loads the detail page of one trademark and derives the owner address from it.
# Takes a long time, because every call may cause one HTTP request.
#
# nummer - trademark number; used in the cache file name, in the detail URL
#          and as key into @errors
#
# A previously saved page (mechanize/detail_<nummer>.html) is used when it
# exists, otherwise the page is fetched and handed to writeResponse. For each
# table cell whose text starts with "inhaber" (case-insensitive) the following
# cell is taken as the owner: its non-empty child texts become the address
# lines. If @errors holds a record for nummer, that record is completed and
# appended to @results; otherwise a new Marke is built from the page.
def fetchDetails(nummer) # takes a long time!
  @counterDetails += 1
  filename = "mechanize/detail_#{nummer}.html"
  if File.exist?(filename) # File.exists? is no longer available in current Ruby
    # Block form closes the cache file once it has been parsed.
    doc = File.open(filename) { |cached| Nokogiri::Slop(cached) }
  else
    url = "https://www.swissreg.ch/srclient/faces/jsp/trademark/sr300.jsp?language=de&section=tm&id=#{nummer}"
    pp "Opening #{url}" if $VERBOSE
    content = @agent.get_file url
    writeResponse(filename, content)
    doc = Nokogiri::Slop(content)
  end
  puts "Bitte um Geduld. Hole Adressdetails für Marke #{nummer}. (#{@counterDetails} von #{@errors.size})"
  path_name = "//html/body/form/div/div/fieldset/div/table/tbody/tr/td"
  cells = doc.xpath(path_name) # evaluated once, the document is not modified below
  counter = 0
  cells.each do |td|
    pp "#{counter}: #{td.text}" if $VERBOSE
    counter += 1 # from here on counter is the index of the cell after td
    next unless /^inhaber/i.match(td.text)

    zeilen = []
    # Commas inside one line are replaced so that LineSplit only separates lines;
    # empty children (<br>) are skipped.
    cells[counter].children.each { |child| zeilen << child.text.gsub(LineSplit, '. ') unless child.text.length == 0 }
    if info = @errors[nummer]
      info.inhaber = zeilen.join(LineSplit)
      # Fixed: the fifth line used to be assigned to a local variable and was lost.
      info.zeile_1, info.zeile_2, info.zeile_3, info.zeile_4, info.zeile_5, info.plz, info.ort = parseAddress(nummer, info.inhaber)
      @results << info
    else
      # NOTE(review): cells 15 and 7 are stored as nodes, not as text - confirm this is intended.
      bezeichnung = cells[15]
      inhaber = zeilen.join(LineSplit)
      zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort = parseAddress(nummer, inhaber)
      hinterlegungsdatum = cells[7]
      marke = Marke.new(bezeichnung, nummer, inhaber, DefaultCountry, hinterlegungsdatum, zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort)
      @results << marke
    end
  end
end
250
+
251
# Reads one page of the hit list and follows the pager recursively.
#
# filename - optional path of a saved result page; when nil the body of
#            @lastResponse is parsed
# counter  - number of the page being read; it is incremented first and then
#            names the pager link ("scroll_1idx<counter>") of the next page
#
# Rows whose country does not match DefaultCountry are skipped. Rows whose
# owner address can be parsed go to @results, the others to @errors (keyed by
# trademark number) so that their address can be fetched from the detail page
# later. While a pager link for the next page exists, that page is requested
# and processed by a recursive call.
def fetchresult(filename = nil, counter = 1)
  doc = if filename
          # Block form closes the file once it has been parsed.
          File.open(filename) { |saved| Nokogiri::Slop(saved) }
        else
          Nokogiri::Slop(@lastResponse.body)
        end
  nrFailures = 0
  counter += 1
  puts "fetchresult. Counter #{counter} already #{@results.size} Datensätze für die Zeitspanne '#{@timespan}'"

  # Is there a pager link pointing at the next page?
  path_name = "//html/body/form/div/div/fieldset/table/tbody/tr/td/table/tr/td"
  hasNext = doc.xpath(path_name).any? { |elem| /scroll_1idx#{counter}/.match(elem.to_s) }

  path_name = "//html/body/form/div/div/fieldset/table/tbody/tr/td/table/tbody/tr"
  doc.xpath(path_name).each do |elem|
    bezeichnung = elem.elements[1].text
    land = elem.elements[4].text
    next unless /#{DefaultCountry}/i.match(land)

    inhaber = elem.elements[3].text
    nummer = elem.elements[2].text
    if bezeichnung.length == 0
      # No wording in the cell: use the src attribute found in the nested nodes instead.
      bezeichnung = elem.children[1].children[0].children[0].children[0].attribute('src').to_s
    end
    zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort = parseAddress(nummer, inhaber)
    marke = Marke.new(bezeichnung, nummer, inhaber, land, elem.elements[5].text,
                      zeile_1, zeile_2, zeile_3, zeile_4, zeile_5, plz, ort)
    if zeile_1
      @results << marke
    else
      nrFailures += 1
      @errors[nummer] = marke
    end
  end

  if hasNext
    @path = "/srclient/faces/jsp/trademark/sr30.jsp"
    puts "Calling sub #{counter} with #{@path}" if $VERBOSE
    data = [
      ["autoScroll", "0,0"],
      ["id_swissreg:mainContent:id_sub_options_result:sub_fieldset:id_cbxHitsPerPage", @hitsPerPage],
      # ["id_swissreg:mainContent:vivian", "TRADEMARK REGISTER SEARCH TIMES: QUERY=[20] SELECT=[823] SERVER=[846] DELEGATE=[861] (HITS=[96])"],
      ["id_swissreg_SUBMIT", "1"],
      ["id_swissreg:_idcl", "id_swissreg:mainContent:scroll_1idx#{counter}"],
      ["id_swissreg:mainContent:scroll_1", "idx#{counter}"],
      ["tmMainId", ""],
      # Fixed: was a one-element pair with a trailing space in the field name;
      # now the same name/value pair as in the other requests.
      ["id_swissreg:_link_hidden_", ""],
      ["javax.faces.ViewState", @state],
    ]
    TMChoiceFields.each { |field2display| data << ["id_swissreg:mainContent:id_sub_options_result:id_ckbTMChoice", field2display] }
    response = @agent.post(Base_uri + @path, data)
    writeResponse("mechanize/resultate_#{counter}.html", response.body)
    checkErrors(response.body)
    @lastResponse = response
    fetchresult(nil, counter)
  else
    puts "Es gab #{nrFailures} Fehler beim Lesen von #{filename}" if $VERBOSE
    puts "Fand #{@results.size} Datensätze für die Zeitspanne '#{@timespan}'. Von #{@errors.size} muss die Adresse noch geholt werden."
  end
end
315
+
316
# Writes all collected records to a CSV file.
#
# filename - path of the CSV file to create (default 'ausgabe.csv')
#
# The header row consists of the member names of the first record. Nothing is
# written when there are no results.
def emitCsv(filename='ausgabe.csv')
  return if @results.empty?

  # Options are passed without braces: a positional options hash is rejected
  # by CSV.open on Ruby 3, while this form works on old and new versions.
  CSV.open(filename, 'w', :headers => @results.first.members, :write_headers => true) do |csv|
    @results.each { |marke| csv << marke.to_a }
  end
  puts "Speicherte #{@results.size} gefunden Datensätze für die Zeitspanne '#{@timespan}' in #{filename}"
end
324
+
325
# Calls fetchDetails once for every trademark number recorded in @errors,
# i.e. for every record whose address could not be parsed from the hit list.
def fetchMissingDetails
  @errors.each_key { |markennummer| fetchDetails(markennummer) }
end
331
+ end # class Swissreg
332
+
333
# Module-level entry point: runs a complete export for one filing-date range.
#
# timespan - filing-date range handed to Swissreg.new
#
# Performs the search, reads all result pages, fetches the detail pages for
# records with unparsed addresses and finally writes the CSV file.
# Returns whatever Swissreg#emitCsv returns.
def self.run(timespan)
  swissreg = Swissreg.new(timespan)
  swissreg.parse_swissreg      # submit the search
  swissreg.fetchresult         # read every page of the hit list
  swissreg.fetchMissingDetails # complete addresses from the detail pages
  swissreg.emitCsv             # write the default CSV file
end
340
+
341
+ end # module Brand2csv
@@ -0,0 +1,3 @@
1
module Brand2csv
  # Version of the brand2csv gem; frozen so it cannot be modified in place.
  VERSION = "0.1.1".freeze
end
@@ -0,0 +1,56 @@
1
+ h3. started brand2csv (12 May 2013)
2
+
3
+ * Added minimal files to create a Ruby gem
4
+ * Started a spike.rb to fetch some elements from swissreg.ch via mechanize
5
+ * To get familiar with mechanize used the google example
6
+ Had to replace @page.form_with(:name => 'f')@ by @page.form_with(:name => 'gbqf')@
7
+
8
+ * www.swissreg.ch must be opened with agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
9
+
10
+ * Examples of a link to details for a brand record are
11
+ bc. https://www.swissreg.ch/srclient/de/tm/61082/2011
12
+ https://www.swissreg.ch/srclient/de/tm/61082/2011
13
+ https://www.swissreg.ch/srclient/faces/jsp/trademark/sr300.jsp?language=en&section=tm&id=61082/2011
14
+
15
+ * Links
16
+ ** Marken Suchen https://www.swissreg.ch/srclient/faces/jsp/start.jsp
17
+ ** Erweiterte Suche https://www.swissreg.ch/srclient/faces/jsp/trademark/sr1.jsp
18
+ ** Resultate der Detailsuche unter https://www.swissreg.ch/srclient/faces/jsp/trademark/sr3.jsp
19
+
20
+ Wasted some time to discover that swissreg.rb does not use mechanize, but URI and hpricot to fetch the patent registration.
21
+
22
+ With watir the following few lines sufficed to fetch a detail
23
+
24
+ bc. Swiss_reg_URL = 'https://www.swissreg.ch'
25
+ client = Selenium::WebDriver::Remote::Http::Default.new
26
+ browser = Watir::Browser.new :firefox
27
+ browser.goto Swiss_reg_URL
28
+ browser.link(:id, "id_swissreg_sub_nav_ipiNavigation_item0").click
29
+ browser.link(:id, "id_swissreg_sub_nav_ipiNavigation_item0_item3").click
30
+ browser.text_field(:id, "id_swissreg:mainContent:id_txf_appDate").set("1.10.2011-5.10.2011")
31
+ browser.button(:value,"suchen").click
32
+ browser.link(:id, "id_swissreg:mainContent:data:2:tm_no_detail:id_detail").click# puts browser.text
33
+
34
+ Was not able to create a spike using either mechanize or uri/hpricot to fetch the details.
35
+
36
+ * Thoughts about the CLI interface to csv
37
+
38
+ bc. brand2csv --help
39
+ Usage: brand2csv 1.10.2011-5.10.2011 [name_of_brand]
40
+ Fetches brand records from swissreg for the given date range into results.csv.
41
+ Each result contains the following fields
42
+ - date of registration
43
+ - brandname
44
+ - owner of brand
45
+ -- name
46
+ -- addressline1
47
+ -- addressline2 (optional)
48
+ -- zip code
49
+ -- city
50
+ Only owners inside Switzerland will be returned.
51
+
52
+ * Would this be a good extension?
53
+ Accumulate all given results (+ temporary result like info_line_1..x) into a sqlite database.
54
+ Would allow an easy sql manipulation of data for filtering/sorting addresses, etc.
55
+
56
+