justiz 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 79087f670423816d367f2030964da9fb891ebe1f
4
+ data.tar.gz: f312cfac2918d6821bf9dd0efafc953e3c2c4f1c
5
+ SHA512:
6
+ metadata.gz: 47e19fb2ccae3d712dcb17970061050baf6b4725584c51575a1adfd8438bcffb62861a001d593fb2be1bbfb79221b774c385a06e02f41cc870f6cebd6823ec8b
7
+ data.tar.gz: 502813f3a6b29869650375235d5c9f309e5740f3b8ec11c641f44631cda1c933700efaa14bc74986f7c4e6c37cfd222d707083276b110275a17e439e3683eaa1
data/.gitignore CHANGED
@@ -15,3 +15,4 @@ spec/reports
15
15
  test/tmp
16
16
  test/version_tmp
17
17
  tmp
18
+ .rvmrc
data/.rvmrc ADDED
@@ -0,0 +1,60 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # This is an RVM Project .rvmrc file, used to automatically load the ruby
4
+ # development environment upon cd'ing into the directory
5
+
6
+ # First we specify our desired <ruby>[@<gemset>], the @gemset name is optional,
7
+ # Only full ruby name is supported here, for short names use:
8
+ # echo "rvm use 2.0.0" > .rvmrc
9
+ environment_id="ruby-2.0.0-p195@justiz"
10
+
11
+ # Uncomment the following lines if you want to verify rvm version per project
12
+ # rvmrc_rvm_version="1.20.13 (stable)" # 1.10.1 seems like a safe start
13
+ # eval "$(echo ${rvm_version}.${rvmrc_rvm_version} | awk -F. '{print "[[ "$1*65536+$2*256+$3" -ge "$4*65536+$5*256+$6" ]]"}' )" || {
14
+ # echo "This .rvmrc file requires at least RVM ${rvmrc_rvm_version}, aborting loading."
15
+ # return 1
16
+ # }
17
+
18
+ # First we attempt to load the desired environment directly from the environment
19
+ # file. This is very fast and efficient compared to running through the entire
20
+ # CLI and selector. If you want feedback on which environment was used then
21
+ # insert the word 'use' after --create as this triggers verbose mode.
22
+ if [[ -d "${rvm_path:-$HOME/.rvm}/environments"
23
+ && -s "${rvm_path:-$HOME/.rvm}/environments/$environment_id" ]]
24
+ then
25
+ \. "${rvm_path:-$HOME/.rvm}/environments/$environment_id"
26
+ for __hook in "${rvm_path:-$HOME/.rvm}/hooks/after_use"*
27
+ do
28
+ if [[ -f "${__hook}" && -x "${__hook}" && -s "${__hook}" ]]
29
+ then \. "${__hook}" || true
30
+ fi
31
+ done
32
+ unset __hook
33
+ if (( ${rvm_use_flag:=1} >= 2 )) # display only when forced
34
+ then
35
+ if [[ $- == *i* ]] # check for interactive shells
36
+ then printf "%b" "Using: \E[32m$GEM_HOME\E[0m" # show the user the ruby and gemset they are using in green
37
+ else printf "%b" "Using: $GEM_HOME" # don't use colors in non-interactive shells
38
+ fi
39
+ fi
40
+ else
41
+ # If the environment file has not yet been created, use the RVM CLI to select.
42
+ rvm --create "$environment_id" || {
43
+ echo "Failed to create RVM environment '${environment_id}'."
44
+ return 1
45
+ }
46
+ fi
47
+
48
+ # If you use bundler, this might be useful to you:
49
+ # if [[ -s Gemfile ]] && {
50
+ # ! builtin command -v bundle >/dev/null ||
51
+ # builtin command -v bundle | GREP_OPTIONS= \grep $rvm_path/bin/bundle >/dev/null
52
+ # }
53
+ # then
54
+ # printf "%b" "The rubygem 'bundler' is not installed. Installing it now.\n"
55
+ # gem install bundler
56
+ # fi
57
+ # if [[ -s Gemfile ]] && builtin command -v bundle >/dev/null
58
+ # then
59
+ # bundle install | GREP_OPTIONS= \grep -vE '^Using|Your bundle is complete'
60
+ # fi
data/justiz.gemspec CHANGED
@@ -25,4 +25,6 @@ Gem::Specification.new do |spec|
25
25
  spec.add_development_dependency "rake"
26
26
  spec.add_development_dependency "rspec", "~> 2.6"
27
27
  spec.add_development_dependency "awesome_print"
28
+ spec.add_development_dependency "webmock"
29
+ spec.add_development_dependency "vcr"
28
30
  end
@@ -0,0 +1,26 @@
module Justiz
  # Splits a free-form address line such as
  # "Schlossbezirk 3, 76131 Karlsruhe" into street, postal code (PLZ) and city.
  #
  # Everything before the last comma is treated as the street; the last
  # comma-separated part is searched for a five-digit PLZ followed by the city.
  class Address
    attr_reader :text, :city, :plz, :street

    # @param text [String, nil] the raw address line. nil or blank input is
    #   accepted and simply leaves street, plz and city as nil.
    def initialize(text)
      @text = text
      parse
    end

    private

    # Populates @street, @plz and @city from the raw text.
    def parse
      # to_s guards against nil input (a contact without that address field).
      parts = text.to_s.split(/\s*,\s*/)
      # "" and "," both split into an empty array: nothing to parse.
      return if parts.empty?

      @street = parts[0...-1].join(", ") if parts.length > 1
      @plz, @city = parse_city(parts.last)
    end

    # @param string [String] the last comma-separated part of the address
    # @return [Array(String, String), nil] [plz, city], or nil when the part
    #   contains no five-digit postal code
    def parse_city(string)
      match = string.match(/\s*([0-9]{5})\s*(.*)/)
      [match[1], match[2]] if match
    end
  end
end
@@ -1,4 +1,22 @@
1
+ require 'ostruct'
2
+
1
3
  module Justiz
2
- class Contact < Hash
4
+ class Contact < OpenStruct
5
+ # std fields: court, location, post, phone, fax, justiz_id, url, email
6
+
7
+ def id
8
+ # too many duplicates
9
+ #[court, justiz_id].compact.join("")
10
+ # currently no duplicates
11
+ [court, email].compact.join("")
12
+ end
13
+
14
+ def location_address
15
+ Address.new(self[:location])
16
+ end
17
+
18
+ def post_address
19
+ Address.new(self[:post])
20
+ end
3
21
  end
4
22
  end
@@ -3,78 +3,94 @@ module Justiz
3
3
  class Courts
4
4
 
5
5
  def court_types
6
- select_options(home_page, 'gerausw')
6
+ home_page.options_of 'gerausw'
7
7
  end
8
8
 
9
9
  def states
10
- select_options(home_page, 'landausw')
10
+ home_page.options_of 'landausw'
11
11
  end
12
12
 
13
- def scrape(court_type, state)
14
- page = load_page(court_type, state)
15
- # if we reach limit on ALL, query each subtype
16
- if court_type == 'ALL' && limit_warning?(page)
17
- court_types.map do |court_type, description|
18
- next if court_type == 'ALL'
19
- page = load_page(court_type, state)
20
- if limit_warning?(page)
21
- puts(STDERR, "Warning: State #{state} has too many contacts of #{description}[#{court_type}]")
22
- end
23
- parse_page(page)
24
- end.flatten.compact.uniq
25
- else
26
- parse_page(page)
27
- end
13
+ def contacts
14
+ states.keys.map do |state|
15
+ contacts_for(state)
16
+ end.flatten.compact
17
+ end
18
+
19
+ def contacts_for(state)
20
+ page = load_page('ALL', state)
21
+ return page.contacts unless page.limit_warning?
22
+
23
+ # do each type separately hoping to avoid limit warning
24
+ court_types.keys.map do |court_type|
25
+ contacts_of_type(court_type, state)
26
+ end.flatten.compact.uniq
27
+ end
28
+
29
+ def contacts_of_type(type, state)
30
+ load_page(type, state, with_warning: true).contacts
28
31
  end
29
32
 
30
33
  private
31
34
 
32
35
  def home_page
33
- @home_page ||= agent.get('http://www.justizadressen.nrw.de/og.php?MD=nrw')
36
+ @home_page ||= Page.new(agent.get('http://www.justizadressen.nrw.de/og.php?MD=nrw'))
34
37
  end
35
38
 
# Submits the search form found on the home page for one court type and
# state, and wraps the resulting page in a Page.
#
# @param court_type [String] value for the 'gerausw' select (e.g. 'ALL', 'SG')
# @param state [String] value for the 'landausw' select (e.g. 'NRW')
# @param options [Hash] pass with_warning: true to emit a warning on stderr
#   when the result page reports the limit warning
# @return [Page] the wrapped result page
def load_page(court_type, state, options = {})
  form = home_page.forms[2]
  form['gerausw'] = court_type
  form['landausw'] = state
  page = Page.new(agent.submit(form, form.buttons_with(name: 'suchen1').first))
  if options[:with_warning] && page.limit_warning?
    # Kernel#puts(STDERR, msg) would print the IO object and the message to
    # stdout; write to the error stream explicitly instead.
    STDERR.puts("Warning: State #{state} has too many contacts of #{court_type}")
  end
  page
end
42
49
 
43
50
  def agent
44
51
  @agent ||= Justiz::Scraper::Agent.new
45
52
  end
53
+ end
46
54
 
47
- def parse_page(page)
48
- rows = page.search('tr').map { |tr| tr.search('td').to_a }
49
- contact_rows = rows.find_all { |row| row.length == 3 }
50
- contact_rows.map do |court, addresses, phones|
51
- addresses = AddressTd.new(addresses)
52
- phones = AddressTd.new(phones)
53
-
54
- Justiz::Contact.new.merge court: court.text.strip,
55
- location: addresses.lieferanschrift,
56
- post: addresses.postfach,
57
- phone: phones.telefone,
58
- fax: phones.fax,
59
- justiz_id: phones.justiz_id,
60
- url: phones.url,
61
- email: phones.email
55
+ class Page < SimpleDelegator
56
+ def limit_warning?
57
+ # avoid invalid UTF-8 errors by force encoding.
58
+ search('p').find do |p|
59
+ p.text.force_encoding("ISO-8859-15") =~ /Ihre Suchanfrage ergab mehr als/i
62
60
  end
63
61
  end
64
62
 
65
- def limit_warning?(page)
66
- # avoid invalid UTF-8 errors by force encoding.
67
- page.search('p').find {|p| p.text.force_encoding("ISO-8859-15") =~ /Ihre Suchanfrage ergab mehr als/i}
63
+ # return hash of options of select field, exclude ALL value
64
+ def options_of(name)
65
+ search("[name='#{name}'] > option").inject({}) do |memo, node|
66
+ memo[node['value']] = node.text unless node['value'] == 'ALL'
67
+ memo
68
+ end
68
69
  end
69
70
 
70
- def select_options(page, name)
71
- page.search("[name='#{name}'] > option").inject({}) do |memo, node|
72
- memo[node['value']] = node.text
73
- memo
71
+ def contacts
72
+ @contacts ||= parse_contacts
73
+ end
74
+
75
+ def parse_contacts
76
+ rows = search('tr').map { |tr| tr.search('td').to_a }
77
+ contact_rows = rows.find_all { |row| row.length == 3 }
78
+ contact_rows.map do |court, addresses, kontakt|
79
+ addresses = AddressTd.new(addresses)
80
+ kontakt = KontaktTd.new(kontakt)
81
+
82
+ Justiz::Contact.new(court: court.text.strip,
83
+ location: addresses.lieferanschrift,
84
+ post: addresses.postfach,
85
+ phone: kontakt.telefone,
86
+ fax: kontakt.fax,
87
+ justiz_id: kontakt.justiz_id,
88
+ url: kontakt.url,
89
+ email: kontakt.email)
74
90
  end
75
91
  end
76
92
 
77
- class AddressTd
93
+ class Td
78
94
  attr_reader :texts
79
95
 
80
96
  def initialize(node)
@@ -82,18 +98,27 @@ module Justiz
82
98
  @texts = nodes.map { |n| n.text.strip }.find_all { |t| !blank?(t) }
83
99
  end
84
100
 
85
- def telefone
86
- same_line('Telefon:')
101
+ private
102
+
103
+ def blank?(something)
104
+ something.to_s !~ /[^[:space:]]/
87
105
  end
88
106
 
89
- def fax
90
- same_line('Fax:')
107
+ def next_line(name)
108
+ reg = Regexp.new(name, true)
109
+ line = texts.find_index { |text| text.match(reg) }
110
+ line && texts[line + 1]
91
111
  end
92
112
 
93
- def justiz_id
94
- same_line('XJustiz-ID:')
113
+ def same_line(name)
114
+ reg = Regexp.new("#{name}(.*)", true)
115
+ text = texts.map { |t| t.match(reg) }.compact.first
116
+ text = text[1].strip if text
117
+ text if !blank?(text)
95
118
  end
119
+ end
96
120
 
121
+ class AddressTd < Td
97
122
  def lieferanschrift
98
123
  next_line('Lieferanschrift')
99
124
  end
@@ -101,32 +126,32 @@ module Justiz
101
126
  def postfach
102
127
  next_line('Postanschrift')
103
128
  end
129
+ end
104
130
 
105
- def url
106
- next_line('URL')
107
- end
131
+ class KontaktTd < Td
132
+ attr_reader :url
108
133
 
109
- def email
110
- next_line('Mail')
134
+ def initialize(node)
135
+ super
136
+ if (a = node.search('a').first)
137
+ @url = a['href']
138
+ end
111
139
  end
112
140
 
113
- private
141
+ def telefone
142
+ same_line('Telefon:')
143
+ end
114
144
 
115
- def blank?(something)
116
- something.to_s !~ /[^[:space:]]/
145
+ def fax
146
+ same_line('Fax:')
117
147
  end
118
148
 
119
- def next_line(name)
120
- reg = Regexp.new(name, true)
121
- line = texts.find_index { |text| text.match(reg) }
122
- line && texts[line + 1]
149
+ def justiz_id
150
+ same_line('XJustiz-ID:')
123
151
  end
124
152
 
125
- def same_line(name)
126
- reg = Regexp.new("#{name}(.*)", true)
127
- text = texts.map { |t| t.match(reg) }.compact.first
128
- text = text[1].strip if text
129
- text if !blank?(text)
153
+ def email
154
+ next_line('Mail')
130
155
  end
131
156
  end
132
157
  end
@@ -1,3 +1,3 @@
1
1
  module Justiz
2
- VERSION = "0.0.1"
2
+ VERSION = "0.1.0"
3
3
  end
data/lib/justiz.rb CHANGED
@@ -2,6 +2,7 @@ require "justiz/version"
2
2
  require 'justiz/cli'
3
3
  require 'justiz/scraper/agent'
4
4
  require 'justiz/scraper/courts'
5
+ require 'justiz/address'
5
6
  require 'justiz/contact'
6
7
 
7
8
  module Justiz
@@ -0,0 +1,11 @@
1
+ require_relative '../../spec/lib/spec_helper'
2
+
3
+ describe Justiz::Address do
4
+ context "simple address" do
5
+ subject { Justiz::Address.new('Schlossbezirk 3, 76131 Karlsruhe') }
6
+
7
+ it { expect(subject.city).to eq 'Karlsruhe' }
8
+ it { expect(subject.street).to eq 'Schlossbezirk 3' }
9
+ it { expect(subject.plz).to eq '76131' }
10
+ end
11
+ end
@@ -1,13 +1,10 @@
1
1
  # encoding: utf-8
2
2
 
3
- require 'rspec'
4
- require 'justiz'
5
- require 'awesome_print'
3
+ require_relative '../../lib/spec_helper'
6
4
 
7
5
  describe Justiz::Scraper::Courts do
8
6
  it "should have court types" do
9
7
  types = {
10
- "ALL" => "-- alle Gerichte/Behörden --",
11
8
  "AG" => "Amtsgerichte",
12
9
  "LG" => "Landgerichte",
13
10
  "OLG" => "Oberlandesgerichte",
@@ -29,12 +26,13 @@ describe Justiz::Scraper::Courts do
29
26
  "JVA" => "Vollzugseinrichtungen",
30
27
  "AND" => "Sonstige Justizbehörden"
31
28
  }
32
- expect(subject.court_types).to eq(types)
29
+ VCR.use_cassette 'courts/homepage' do
30
+ expect(subject.court_types).to eq(types)
31
+ end
33
32
  end
34
33
 
35
34
  it "should have states" do
36
35
  types = {
37
- "ALL" => "-- Auswahl über PLZ/Ort --",
38
36
  "BRD" => "Bundesgerichte/-behörden",
39
37
  "BW" => "Baden-Württemberg",
40
38
  "BAY" => "Bayern",
@@ -53,56 +51,156 @@ describe Justiz::Scraper::Courts do
53
51
  "SH" => "Schleswig-Holstein",
54
52
  "TH" => "Thüringen"
55
53
  }
56
- expect(subject.states).to eq(types)
54
+ VCR.use_cassette 'courts/homepage' do
55
+ expect(subject.states).to eq(types)
56
+ end
57
57
  end
58
58
 
59
- context "Bundesgerichte" do
60
- it "should find all Bundesgerichte" do
61
- contacts = subject.scrape('ALL', 'BRD')
62
- #ap contacts
63
- expect(contacts.count).to eq(12)
59
+ context "entry counts" do
60
+ context "Bundesgerichte" do
61
+ it "should find all Bundesgerichte" do
62
+ VCR.use_cassette 'courts/all_brd' do
63
+ contacts = subject.contacts_for('BRD')
64
+ #ap contacts
65
+ expect(contacts.count).to eq(12)
66
+ end
67
+ end
68
+ end
69
+
70
+
71
+ context "NRW" do
72
+ it "should find all NRW" do
73
+ VCR.use_cassette 'courts/all_nrw' do
74
+ contacts = subject.contacts_for('NRW')
75
+ #ap contacts
76
+ expect(contacts.count).to eq(513)
77
+ end
78
+ end
79
+ end
80
+
81
+ context "search all" do
82
+ it "should find all entries" do
83
+ VCR.use_cassette 'courts/all_all' do
84
+ original = {
85
+ "BRD" => "Bundesgerichte/-behörden 12",
86
+ "BW" => "Baden-Württemberg 430",
87
+ "BAY" => "Bayern 348",
88
+ "B" => "Berlin 38",
89
+ "BRA" => "Brandenburg 64",
90
+ "BRE" => "Bremen 19",
91
+ "HH" => "Hamburg 35",
92
+ "HES" => "Hessen 115",
93
+ "MV" => "Mecklenburg-Vorpommern 55",
94
+ "NS" => "Niedersachsen 305",
95
+ "NRW" => "Nordrhein-Westfalen 513",
96
+ "RPF" => "Rheinland-Pfalz 101",
97
+ "SAA" => "Saarland 32",
98
+ "SAC" => "Sachsen 79",
99
+ "SAH" => "Sachsen-Anhalt 69",
100
+ "SH" => "Schleswig-Holstein 58",
101
+ "TH" => "Thüringen 63"
102
+ }
103
+ states = subject.states
104
+ total = 0
105
+ states.keys.each do |state|
106
+ count = subject.contacts_for(state).count
107
+ states[state] += " #{count}"
108
+ total += count
109
+ end
110
+ expect(states).to eq(original)
111
+ expect(total).to eq(2336)
112
+ end
113
+ end
64
114
  end
65
115
  end
66
116
 
117
+ context "id" do
118
+ it "dumps non-unique ids" do
119
+ msg = []
120
+ VCR.use_cassette 'courts/all_all' do
121
+ subject.contacts.inject({}) do |memo, c|
122
+ k = c.id
123
+ if memo.has_key?(k)
124
+ msg << "#{memo[k]}\n#{c}"
125
+ else
126
+ memo[k] = c
127
+ end
128
+ memo
129
+ end
130
+ expect(msg).to eq([])
131
+ end
132
+ end
67
133
 
68
- context "NRW" do
69
- it "should find all NRW" do
70
- contacts = subject.scrape('ALL', 'NRW')
71
- #ap contacts
72
- expect(contacts.count).to eq(513)
134
+ it "should have unique court names" do
135
+ VCR.use_cassette 'courts/all_all' do
136
+ ids = subject.contacts.map(&:id)
137
+ expect(ids.uniq.length).to eq(ids.length)
138
+ end
73
139
  end
74
140
  end
75
141
 
76
- context "search all" do
77
- it "should find all entries" do
78
- # as of 13.6.2013
79
- original = {
80
- "ALL" => "-- Auswahl über PLZ/Ort -- 0",
81
- "BRD" => "Bundesgerichte/-behörden 12",
82
- "BW" => "Baden-Württemberg 430",
83
- "BAY" => "Bayern 348",
84
- "B" => "Berlin 38",
85
- "BRA" => "Brandenburg 64",
86
- "BRE" => "Bremen 19",
87
- "HH" => "Hamburg 35",
88
- "HES" => "Hessen 115",
89
- "MV" => "Mecklenburg-Vorpommern 55",
90
- "NS" => "Niedersachsen 305",
91
- "NRW" => "Nordrhein-Westfalen 513",
92
- "RPF" => "Rheinland-Pfalz 101",
93
- "SAA" => "Saarland 32",
94
- "SAC" => "Sachsen 79",
95
- "SAH" => "Sachsen-Anhalt 69",
96
- "SH" => "Schleswig-Holstein 58",
97
- "TH" => "Thüringen 63"
98
- }
99
- states = subject.states
100
- states.keys.each do |state|
101
- count = subject.scrape('ALL', state).count
102
- states[state] += " #{count}"
142
+
143
+ context "contact details" do
144
+ it "parses first url from multiple" do
145
+ VCR.use_cassette 'courts/sg_b' do
146
+ contacts = subject.contacts_of_type('SG', 'B')
147
+ expect(contacts.length).to eq(2)
148
+ expect(contacts.first.url).to eq('http://www.berlin.de/lsg')
149
+ end
150
+ end
151
+
152
+ it "should have full location addresses" do
153
+ VCR.use_cassette 'courts/all_all' do
154
+ failed = false
155
+ subject.contacts.each do |contact|
156
+ a = contact.location_address
157
+ unless a.street && a.plz && a.city
158
+ ap a
159
+ failed = true
160
+ end
161
+ end
162
+ expect(failed).to eq(false)
163
+ end
164
+ end
165
+
# Checks the postal address (not the location address, which the previous
# example already covers) of every contact for a PLZ and a city.
it "should have plz & city post addresses" do
  VCR.use_cassette 'courts/all_all' do
    failed = false
    subject.contacts.each do |contact|
      a = contact.post_address
      unless a.plz && a.city
        # print the offending address to ease debugging
        ap a
        failed = true
      end
    end
    expect(failed).to eq(false)
  end
end
179
+
180
+ xit "dumps contact in csv format" do
181
+ require 'csv'
182
+ VCR.use_cassette 'courts/all_all' do
183
+ CSV.open("address.csv", "wb") do |csv|
184
+ subject.contacts.each do |c|
185
+ l = c.location_address
186
+ p = c.post_address
187
+ csv << [c.justiz_id, c.court,
188
+ l.street, l.plz, l.city,
189
+ p.street, p.plz, p.city,
190
+ c.phone, c.fax, c.email, c.url]
191
+ end
192
+ end
193
+ end
194
+ end
195
+
196
+ it "should return Address" do
197
+ VCR.use_cassette 'courts/all_brd' do
198
+ contact = subject.contacts_for('BRD').first
199
+ expect(contact.location_address).to be_a(Justiz::Address)
200
+ expect(contact.post_address).to be_a(Justiz::Address)
103
201
  end
104
- expect(states).to eq(original)
105
202
  end
106
203
  end
107
204
  end
108
205
 
206
+
@@ -0,0 +1,9 @@
1
+ require 'rspec'
2
+ require 'justiz'
3
+ require 'awesome_print'
4
+ require 'vcr'
5
+
6
+ VCR.configure do |c|
7
+ c.cassette_library_dir = 'spec/vcr'
8
+ c.hook_into :webmock
9
+ end