justiz 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 79087f670423816d367f2030964da9fb891ebe1f
4
+ data.tar.gz: f312cfac2918d6821bf9dd0efafc953e3c2c4f1c
5
+ SHA512:
6
+ metadata.gz: 47e19fb2ccae3d712dcb17970061050baf6b4725584c51575a1adfd8438bcffb62861a001d593fb2be1bbfb79221b774c385a06e02f41cc870f6cebd6823ec8b
7
+ data.tar.gz: 502813f3a6b29869650375235d5c9f309e5740f3b8ec11c641f44631cda1c933700efaa14bc74986f7c4e6c37cfd222d707083276b110275a17e439e3683eaa1
data/.gitignore CHANGED
@@ -15,3 +15,4 @@ spec/reports
15
15
  test/tmp
16
16
  test/version_tmp
17
17
  tmp
18
+ .rvmrc
data/.rvmrc ADDED
@@ -0,0 +1,60 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # This is an RVM Project .rvmrc file, used to automatically load the ruby
4
+ # development environment upon cd'ing into the directory
5
+
6
+ # First we specify our desired <ruby>[@<gemset>], the @gemset name is optional,
7
+ # Only full ruby name is supported here, for short names use:
8
+ # echo "rvm use 2.0.0" > .rvmrc
9
+ environment_id="ruby-2.0.0-p195@justiz"
10
+
11
+ # Uncomment the following lines if you want to verify rvm version per project
12
+ # rvmrc_rvm_version="1.20.13 (stable)" # 1.10.1 seems like a safe start
13
+ # eval "$(echo ${rvm_version}.${rvmrc_rvm_version} | awk -F. '{print "[[ "$1*65536+$2*256+$3" -ge "$4*65536+$5*256+$6" ]]"}' )" || {
14
+ # echo "This .rvmrc file requires at least RVM ${rvmrc_rvm_version}, aborting loading."
15
+ # return 1
16
+ # }
17
+
18
+ # First we attempt to load the desired environment directly from the environment
19
+ # file. This is very fast and efficient compared to running through the entire
20
+ # CLI and selector. If you want feedback on which environment was used then
21
+ # insert the word 'use' after --create as this triggers verbose mode.
22
+ if [[ -d "${rvm_path:-$HOME/.rvm}/environments"
23
+ && -s "${rvm_path:-$HOME/.rvm}/environments/$environment_id" ]]
24
+ then
25
+ \. "${rvm_path:-$HOME/.rvm}/environments/$environment_id"
26
+ for __hook in "${rvm_path:-$HOME/.rvm}/hooks/after_use"*
27
+ do
28
+ if [[ -f "${__hook}" && -x "${__hook}" && -s "${__hook}" ]]
29
+ then \. "${__hook}" || true
30
+ fi
31
+ done
32
+ unset __hook
33
+ if (( ${rvm_use_flag:=1} >= 2 )) # display only when forced
34
+ then
35
+ if [[ $- == *i* ]] # check for interactive shells
36
+ then printf "%b" "Using: \E[32m$GEM_HOME\E[0m" # show the user the ruby and gemset they are using in green
37
+ else printf "%b" "Using: $GEM_HOME" # don't use colors in non-interactive shells
38
+ fi
39
+ fi
40
+ else
41
+ # If the environment file has not yet been created, use the RVM CLI to select.
42
+ rvm --create "$environment_id" || {
43
+ echo "Failed to create RVM environment '${environment_id}'."
44
+ return 1
45
+ }
46
+ fi
47
+
48
+ # If you use bundler, this might be useful to you:
49
+ # if [[ -s Gemfile ]] && {
50
+ # ! builtin command -v bundle >/dev/null ||
51
+ # builtin command -v bundle | GREP_OPTIONS= \grep $rvm_path/bin/bundle >/dev/null
52
+ # }
53
+ # then
54
+ # printf "%b" "The rubygem 'bundler' is not installed. Installing it now.\n"
55
+ # gem install bundler
56
+ # fi
57
+ # if [[ -s Gemfile ]] && builtin command -v bundle >/dev/null
58
+ # then
59
+ # bundle install | GREP_OPTIONS= \grep -vE '^Using|Your bundle is complete'
60
+ # fi
data/justiz.gemspec CHANGED
@@ -25,4 +25,6 @@ Gem::Specification.new do |spec|
25
25
  spec.add_development_dependency "rake"
26
26
  spec.add_development_dependency "rspec", "~> 2.6"
27
27
  spec.add_development_dependency "awesome_print"
28
+ spec.add_development_dependency "webmock"
29
+ spec.add_development_dependency "vcr"
28
30
  end
@@ -0,0 +1,26 @@
1
module Justiz
  # Splits a one-line postal address such as
  # "Schlossbezirk 3, 76131 Karlsruhe" into street, postal code (PLZ) and city.
  #
  # Everything before the last comma becomes the street; the last
  # comma-separated part is expected to hold a five-digit PLZ followed by the
  # city. Parts that cannot be recognised are left as nil.
  class Address
    # Five-digit postal code, optional whitespace, then the city name.
    PLZ_AND_CITY = /\s*([0-9]{5})\s*(.*)/

    attr_reader :text, :city, :plz, :street

    # @param text [String, nil] raw address line; nil (a contact without that
    #   kind of address) yields an Address whose parts are all nil instead of
    #   raising NoMethodError
    def initialize(text)
      @text = text
      parse
    end

    private

    # Fills @street, @plz and @city from the raw text.
    def parse
      parts = text.to_s.split(/\s*,\s*/)
      @street = parts[0...-1].join(", ") if parts.length > 1
      @plz, @city = parse_city(parts.last)
    end

    # @param string [String, nil] the last comma-separated part of the address
    # @return [Array(String, String), nil] [plz, city], or nil when no
    #   five-digit postal code is present
    def parse_city(string)
      match = PLZ_AND_CITY.match(string.to_s)
      [match[1], match[2]] if match
    end
  end
end
@@ -1,4 +1,22 @@
1
+ require 'ostruct'
2
+
1
3
  module Justiz
2
- class Contact < Hash
4
+ class Contact < OpenStruct
5
+ # std fields: court, location, post, phone, fax, justiz_id, url, email
6
+
7
+ def id
8
+ # too many duplicates
9
+ #[court, justiz_id].compact.join("")
10
+ # currently no duplicates
11
+ [court, email].compact.join("")
12
+ end
13
+
14
+ def location_address
15
+ Address.new(self[:location])
16
+ end
17
+
18
+ def post_address
19
+ Address.new(self[:post])
20
+ end
3
21
  end
4
22
  end
@@ -3,78 +3,94 @@ module Justiz
3
3
  class Courts
4
4
 
5
5
  def court_types
6
- select_options(home_page, 'gerausw')
6
+ home_page.options_of 'gerausw'
7
7
  end
8
8
 
9
9
  def states
10
- select_options(home_page, 'landausw')
10
+ home_page.options_of 'landausw'
11
11
  end
12
12
 
13
- def scrape(court_type, state)
14
- page = load_page(court_type, state)
15
- # if we reach limit on ALL, query each subtype
16
- if court_type == 'ALL' && limit_warning?(page)
17
- court_types.map do |court_type, description|
18
- next if court_type == 'ALL'
19
- page = load_page(court_type, state)
20
- if limit_warning?(page)
21
- puts(STDERR, "Warning: State #{state} has too many contacts of #{description}[#{court_type}]")
22
- end
23
- parse_page(page)
24
- end.flatten.compact.uniq
25
- else
26
- parse_page(page)
27
- end
13
# All contacts of all states, as one flat list without nil entries.
def contacts
  per_state = states.keys.map { |state| contacts_for(state) }
  per_state.flatten.compact
end
18
+
19
# Contacts of one state.
#
# Queries all court types at once first. If that result page carries the
# limit warning, each court type is queried separately (hoping to stay below
# the limit) and the combined list is returned without nils and duplicates.
def contacts_for(state)
  overview = load_page('ALL', state)
  if overview.limit_warning?
    by_type = court_types.keys.map { |court_type| contacts_of_type(court_type, state) }
    by_type.flatten.compact.uniq
  else
    overview.contacts
  end
end
28
+
29
# Contacts of a single court type within one state; the page is loaded with
# the :with_warning option switched on.
def contacts_of_type(type, state)
  result_page = load_page(type, state, with_warning: true)
  result_page.contacts
end
29
32
 
30
33
  private
31
34
 
32
35
  def home_page
33
- @home_page ||= agent.get('http://www.justizadressen.nrw.de/og.php?MD=nrw')
36
+ @home_page ||= Page.new(agent.get('http://www.justizadressen.nrw.de/og.php?MD=nrw'))
34
37
  end
35
38
 
36
- def load_page(court_type, state)
39
# Submits the search form (third form of the home page) for the given court
# type and state and returns the result wrapped in a Page.
#
# @param court_type [String] value for the 'gerausw' form field
# @param state [String] value for the 'landausw' form field
# @param options [Hash] :with_warning - when truthy, a warning is written to
#   standard error if the result page reports the limit warning
# @return [Page]
def load_page(court_type, state, options = {})
  form = home_page.forms[2]
  form['gerausw'] = court_type
  form['landausw'] = state
  page = Page.new(agent.submit(form, form.buttons_with(name: 'suchen1').first))
  if options[:with_warning] && page.limit_warning?
    # Write to the error stream itself; puts(STDERR, msg) would print the
    # STDERR object and the message to standard output instead.
    $stderr.puts "Warning: State #{state} has too many contacts of #{court_type}"
  end
  page
end
42
49
 
43
50
  def agent
44
51
  @agent ||= Justiz::Scraper::Agent.new
45
52
  end
53
+ end
46
54
 
47
- def parse_page(page)
48
- rows = page.search('tr').map { |tr| tr.search('td').to_a }
49
- contact_rows = rows.find_all { |row| row.length == 3 }
50
- contact_rows.map do |court, addresses, phones|
51
- addresses = AddressTd.new(addresses)
52
- phones = AddressTd.new(phones)
53
-
54
- Justiz::Contact.new.merge court: court.text.strip,
55
- location: addresses.lieferanschrift,
56
- post: addresses.postfach,
57
- phone: phones.telefone,
58
- fax: phones.fax,
59
- justiz_id: phones.justiz_id,
60
- url: phones.url,
61
- email: phones.email
55
+ class Page < SimpleDelegator
56
# Truthy when a <p> element contains the "Ihre Suchanfrage ergab mehr als"
# notice; returns the first such element, or nil when there is none.
def limit_warning?
  notice = /Ihre Suchanfrage ergab mehr als/i
  search('p').find do |paragraph|
    # avoid invalid UTF-8 errors by force encoding.
    paragraph.text.force_encoding("ISO-8859-15") =~ notice
  end
end
64
62
 
65
- def limit_warning?(page)
66
- # avoid invalid UTF-8 errors by force encoding.
67
- page.search('p').find {|p| p.text.force_encoding("ISO-8859-15") =~ /Ihre Suchanfrage ergab mehr als/i}
63
# Returns a hash (option value => option text) for the select field with the
# given name; the option whose value is 'ALL' is left out.
def options_of(name)
  search("[name='#{name}'] > option").each_with_object({}) do |option, choices|
    value = option['value']
    next if value == 'ALL'

    choices[value] = option.text
  end
end
69
70
 
70
- def select_options(page, name)
71
- page.search("[name='#{name}'] > option").inject({}) do |memo, node|
72
- memo[node['value']] = node.text
73
- memo
71
# Contacts found on this page; parsed on first access and then memoized.
def contacts
  @contacts || (@contacts = parse_contacts)
end
74
+
75
# Builds one Justiz::Contact per table row that has exactly three cells
# (court name, addresses, contact details); other rows are ignored.
def parse_contacts
  cell_rows = search('tr').map { |tr| tr.search('td').to_a }
  three_cell_rows = cell_rows.select { |cells| cells.length == 3 }

  three_cell_rows.map do |court_td, address_td, kontakt_td|
    address = AddressTd.new(address_td)
    kontakt = KontaktTd.new(kontakt_td)

    Justiz::Contact.new(
      court: court_td.text.strip,
      location: address.lieferanschrift,
      post: address.postfach,
      phone: kontakt.telefone,
      fax: kontakt.fax,
      justiz_id: kontakt.justiz_id,
      url: kontakt.url,
      email: kontakt.email
    )
  end
end
76
92
 
77
- class AddressTd
93
+ class Td
78
94
  attr_reader :texts
79
95
 
80
96
  def initialize(node)
@@ -82,18 +98,27 @@ module Justiz
82
98
  @texts = nodes.map { |n| n.text.strip }.find_all { |t| !blank?(t) }
83
99
  end
84
100
 
85
- def telefone
86
- same_line('Telefon:')
101
+ private
102
+
103
# True when the string form of the argument contains no non-whitespace
# character (so nil and "" count as blank).
def blank?(something)
  first_visible = something.to_s =~ /[^[:space:]]/
  first_visible.nil?
end
88
106
 
89
- def fax
90
- same_line('Fax:')
107
# Returns the text that follows the first text matching +name+
# (case-insensitive regular expression), or nil when nothing matches or the
# match is the last text.
def next_line(name)
  pattern = Regexp.new(name, Regexp::IGNORECASE)
  index = texts.index { |text| text =~ pattern }
  index.nil? ? nil : texts[index + 1]
end
92
112
 
93
- def justiz_id
94
- same_line('XJustiz-ID:')
113
# Returns whatever follows +name+ (case-insensitive regular expression) on
# the first text that contains it, stripped of surrounding whitespace; nil
# when no text matches or the remainder is blank.
def same_line(name)
  pattern = Regexp.new("#{name}(.*)", Regexp::IGNORECASE)
  matches = texts.map { |line| pattern.match(line) }
  found = matches.find { |match| !match.nil? }
  value = found && found[1].strip
  blank?(value) ? nil : value
end
119
+ end
96
120
 
121
+ class AddressTd < Td
97
122
  def lieferanschrift
98
123
  next_line('Lieferanschrift')
99
124
  end
@@ -101,32 +126,32 @@ module Justiz
101
126
  def postfach
102
127
  next_line('Postanschrift')
103
128
  end
129
+ end
104
130
 
105
- def url
106
- next_line('URL')
107
- end
131
# Table cell holding the contact details of a court: phone, fax, XJustiz-ID,
# e-mail and the first hyperlink.
class KontaktTd < Td
  # href of the first <a> element in the cell, nil when the cell has no link.
  attr_reader :url

  def initialize(node)
    super
    first_link = node.search('a').first
    @url = first_link['href'] if first_link
  end

  # Text after 'Telefon:' on the same line.
  def telefone
    same_line('Telefon:')
  end

  # Text after 'Fax:' on the same line.
  def fax
    same_line('Fax:')
  end

  # Text after 'XJustiz-ID:' on the same line.
  def justiz_id
    same_line('XJustiz-ID:')
  end

  # Text on the line following the one that matches 'Mail'.
  def email
    next_line('Mail')
  end
end
132
157
  end
@@ -1,3 +1,3 @@
1
1
  module Justiz
2
- VERSION = "0.0.1"
2
+ VERSION = "0.1.0"
3
3
  end
data/lib/justiz.rb CHANGED
@@ -2,6 +2,7 @@ require "justiz/version"
2
2
  require 'justiz/cli'
3
3
  require 'justiz/scraper/agent'
4
4
  require 'justiz/scraper/courts'
5
+ require 'justiz/address'
5
6
  require 'justiz/contact'
6
7
 
7
8
  module Justiz
@@ -0,0 +1,11 @@
1
+ require_relative '../../spec/lib/spec_helper'
2
+
3
describe Justiz::Address do
  context "simple address" do
    # One street part followed by "PLZ city".
    subject do
      described_class.new('Schlossbezirk 3, 76131 Karlsruhe')
    end

    it { expect(subject.city).to eq('Karlsruhe') }
    it { expect(subject.street).to eq('Schlossbezirk 3') }
    it { expect(subject.plz).to eq('76131') }
  end
end
@@ -1,13 +1,10 @@
1
1
  # encoding: utf-8
2
2
 
3
- require 'rspec'
4
- require 'justiz'
5
- require 'awesome_print'
3
+ require_relative '../../lib/spec_helper'
6
4
 
7
5
  describe Justiz::Scraper::Courts do
8
6
  it "should have court types" do
9
7
  types = {
10
- "ALL" => "-- alle Gerichte/Behörden --",
11
8
  "AG" => "Amtsgerichte",
12
9
  "LG" => "Landgerichte",
13
10
  "OLG" => "Oberlandesgerichte",
@@ -29,12 +26,13 @@ describe Justiz::Scraper::Courts do
29
26
  "JVA" => "Vollzugseinrichtungen",
30
27
  "AND" => "Sonstige Justizbehörden"
31
28
  }
32
- expect(subject.court_types).to eq(types)
29
+ VCR.use_cassette 'courts/homepage' do
30
+ expect(subject.court_types).to eq(types)
31
+ end
33
32
  end
34
33
 
35
34
  it "should have states" do
36
35
  types = {
37
- "ALL" => "-- Auswahl über PLZ/Ort --",
38
36
  "BRD" => "Bundesgerichte/-behörden",
39
37
  "BW" => "Baden-Württemberg",
40
38
  "BAY" => "Bayern",
@@ -53,56 +51,156 @@ describe Justiz::Scraper::Courts do
53
51
  "SH" => "Schleswig-Holstein",
54
52
  "TH" => "Thüringen"
55
53
  }
56
- expect(subject.states).to eq(types)
54
+ VCR.use_cassette 'courts/homepage' do
55
+ expect(subject.states).to eq(types)
56
+ end
57
57
  end
58
58
 
59
- context "Bundesgerichte" do
60
- it "should find all Bundesgerichte" do
61
- contacts = subject.scrape('ALL', 'BRD')
62
- #ap contacts
63
- expect(contacts.count).to eq(12)
59
+ context "entry counts" do
60
+ context "Bundesgerichte" do
61
+ it "should find all Bundesgerichte" do
62
+ VCR.use_cassette 'courts/all_brd' do
63
+ contacts = subject.contacts_for('BRD')
64
+ #ap contacts
65
+ expect(contacts.count).to eq(12)
66
+ end
67
+ end
68
+ end
69
+
70
+
71
+ context "NRW" do
72
+ it "should find all NRW" do
73
+ VCR.use_cassette 'courts/all_nrw' do
74
+ contacts = subject.contacts_for('NRW')
75
+ #ap contacts
76
+ expect(contacts.count).to eq(513)
77
+ end
78
+ end
79
+ end
80
+
81
+ context "search all" do
82
+ it "should find all entries" do
83
+ VCR.use_cassette 'courts/all_all' do
84
+ original = {
85
+ "BRD" => "Bundesgerichte/-behörden 12",
86
+ "BW" => "Baden-Württemberg 430",
87
+ "BAY" => "Bayern 348",
88
+ "B" => "Berlin 38",
89
+ "BRA" => "Brandenburg 64",
90
+ "BRE" => "Bremen 19",
91
+ "HH" => "Hamburg 35",
92
+ "HES" => "Hessen 115",
93
+ "MV" => "Mecklenburg-Vorpommern 55",
94
+ "NS" => "Niedersachsen 305",
95
+ "NRW" => "Nordrhein-Westfalen 513",
96
+ "RPF" => "Rheinland-Pfalz 101",
97
+ "SAA" => "Saarland 32",
98
+ "SAC" => "Sachsen 79",
99
+ "SAH" => "Sachsen-Anhalt 69",
100
+ "SH" => "Schleswig-Holstein 58",
101
+ "TH" => "Thüringen 63"
102
+ }
103
+ states = subject.states
104
+ total = 0
105
+ states.keys.each do |state|
106
+ count = subject.contacts_for(state).count
107
+ states[state] += " #{count}"
108
+ total += count
109
+ end
110
+ expect(states).to eq(original)
111
+ expect(total).to eq(2336)
112
+ end
113
+ end
64
114
  end
65
115
  end
66
116
 
117
+ context "id" do
118
+ it "dumps non-unique ids" do
119
+ msg = []
120
+ VCR.use_cassette 'courts/all_all' do
121
+ subject.contacts.inject({}) do |memo, c|
122
+ k = c.id
123
+ if memo.has_key?(k)
124
+ msg << "#{memo[k]}\n#{c}"
125
+ else
126
+ memo[k] = c
127
+ end
128
+ memo
129
+ end
130
+ expect(msg).to eq([])
131
+ end
132
+ end
67
133
 
68
- context "NRW" do
69
- it "should find all NRW" do
70
- contacts = subject.scrape('ALL', 'NRW')
71
- #ap contacts
72
- expect(contacts.count).to eq(513)
134
+ it "should have unique court names" do
135
+ VCR.use_cassette 'courts/all_all' do
136
+ ids = subject.contacts.map(&:id)
137
+ expect(ids.uniq.length).to eq(ids.length)
138
+ end
73
139
  end
74
140
  end
75
141
 
76
- context "search all" do
77
- it "should find all entries" do
78
- # as of 13.6.2013
79
- original = {
80
- "ALL" => "-- Auswahl über PLZ/Ort -- 0",
81
- "BRD" => "Bundesgerichte/-behörden 12",
82
- "BW" => "Baden-Württemberg 430",
83
- "BAY" => "Bayern 348",
84
- "B" => "Berlin 38",
85
- "BRA" => "Brandenburg 64",
86
- "BRE" => "Bremen 19",
87
- "HH" => "Hamburg 35",
88
- "HES" => "Hessen 115",
89
- "MV" => "Mecklenburg-Vorpommern 55",
90
- "NS" => "Niedersachsen 305",
91
- "NRW" => "Nordrhein-Westfalen 513",
92
- "RPF" => "Rheinland-Pfalz 101",
93
- "SAA" => "Saarland 32",
94
- "SAC" => "Sachsen 79",
95
- "SAH" => "Sachsen-Anhalt 69",
96
- "SH" => "Schleswig-Holstein 58",
97
- "TH" => "Thüringen 63"
98
- }
99
- states = subject.states
100
- states.keys.each do |state|
101
- count = subject.scrape('ALL', state).count
102
- states[state] += " #{count}"
142
+
143
+ context "contact details" do
144
+ it "parses first url from multiple" do
145
+ VCR.use_cassette 'courts/sg_b' do
146
+ contacts = subject.contacts_of_type('SG', 'B')
147
+ expect(contacts.length).to eq(2)
148
+ expect(contacts.first.url).to eq('http://www.berlin.de/lsg')
149
+ end
150
+ end
151
+
152
+ it "should have full location addresses" do
153
+ VCR.use_cassette 'courts/all_all' do
154
+ failed = false
155
+ subject.contacts.each do |contact|
156
+ a = contact.location_address
157
+ unless a.street && a.plz && a.city
158
+ ap a
159
+ failed = true
160
+ end
161
+ end
162
+ expect(failed).to eq(false)
163
+ end
164
+ end
165
+
166
it "should have plz & city post addresses" do
  VCR.use_cassette 'courts/all_all' do
    failed = false
    subject.contacts.each do |contact|
      # Check the postal address here; the delivery (location) address is
      # already covered by "should have full location addresses".
      a = contact.post_address
      unless a.plz && a.city
        ap a
        failed = true
      end
    end
    expect(failed).to eq(false)
  end
end
179
+
180
+ xit "dumps contact in csv format" do
181
+ require 'csv'
182
+ VCR.use_cassette 'courts/all_all' do
183
+ CSV.open("address.csv", "wb") do |csv|
184
+ subject.contacts.each do |c|
185
+ l = c.location_address
186
+ p = c.post_address
187
+ csv << [c.justiz_id, c.court,
188
+ l.street, l.plz, l.city,
189
+ p.street, p.plz, p.city,
190
+ c.phone, c.fax, c.email, c.url]
191
+ end
192
+ end
193
+ end
194
+ end
195
+
196
+ it "should return Address" do
197
+ VCR.use_cassette 'courts/all_brd' do
198
+ contact = subject.contacts_for('BRD').first
199
+ expect(contact.location_address).to be_a(Justiz::Address)
200
+ expect(contact.post_address).to be_a(Justiz::Address)
103
201
  end
104
- expect(states).to eq(original)
105
202
  end
106
203
  end
107
204
  end
108
205
 
206
+
@@ -0,0 +1,9 @@
1
+ require 'rspec'
2
+ require 'justiz'
3
+ require 'awesome_print'
4
+ require 'vcr'
5
+
6
# Recorded HTTP interactions live under spec/vcr and are replayed via WebMock.
VCR.configure do |config|
  config.cassette_library_dir = 'spec/vcr'
  config.hook_into(:webmock)
end