justiz 0.1.3 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/justiz.gemspec +1 -1
- data/lib/justiz/contact.rb +29 -2
- data/lib/justiz/scraper/courts.rb +12 -5
- data/lib/justiz/version.rb +1 -1
- data/spec/lib/contact_spec.rb +12 -0
- data/spec/lib/scraper/courts_spec.rb +13 -6
- data/spec/vcr/courts/zvg_nrw.yml +3009 -0
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2956b90485c84af57cd711724282b36debf63a3c
|
4
|
+
data.tar.gz: 09412689d63cb8fd117e9f06b7bb9f7a3fcc49b6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bf3287ebafd6d08a41a00e0871eea16e4fbd1e0f3931730177d3e0ed995fd6a4c67a32ddc3949554f5dd06859807f2e309ba1e00c236d51e2e7fbf4f885e1d28
|
7
|
+
data.tar.gz: 3c93f26946552ad11888666f775c3e2658d44b330bda2e4057f21dda5ab3d657dcbbafb19a5151423e5484da8065226a841671413dd223b2860c25e418ac688d
|
data/justiz.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.email = ["mikep@quake.net"]
|
11
11
|
spec.description = %q{Extracts contact data.}
|
12
12
|
spec.summary = %q{Extract contact data from http://www.justizadressen.nrw.de/}
|
13
|
-
spec.homepage = ""
|
13
|
+
spec.homepage = "https://github.com/mike-park/justiz"
|
14
14
|
spec.license = "MIT"
|
15
15
|
|
16
16
|
spec.files = `git ls-files`.split($/)
|
data/lib/justiz/contact.rb
CHANGED
@@ -6,6 +6,16 @@ module Justiz
|
|
6
6
|
FIELDS = [:court, :location, :post, :phone, :fax, :justiz_id, :url, :email]
|
7
7
|
attr_accessor :attributes, *FIELDS
|
8
8
|
|
9
|
+
# remove from court if present
|
10
|
+
COURT_NOISE = [
|
11
|
+
' - Insolvenzabteilung -',
|
12
|
+
' - Handels- und Genossenschaftsregister -',
|
13
|
+
' - Vereinsregister -',
|
14
|
+
' - Partnerschaftsregister -',
|
15
|
+
' - Zwangsversteigerung/-verwaltung -',
|
16
|
+
' - Familiengericht -'
|
17
|
+
]
|
18
|
+
|
9
19
|
def initialize(attributes = {})
|
10
20
|
self.attributes = attributes
|
11
21
|
end
|
@@ -16,11 +26,16 @@ module Justiz
|
|
16
26
|
end
|
17
27
|
end
|
18
28
|
|
29
|
+
def court=(name)
|
30
|
+
@court = remove_court_noise(name)
|
31
|
+
end
|
32
|
+
|
19
33
|
def id
|
20
34
|
# too many duplicates
|
21
35
|
#[court, justiz_id].compact.join("")
|
22
|
-
#
|
23
|
-
[court, email].compact.join("")
|
36
|
+
# with noise removed also too many duplicates
|
37
|
+
#[court, email].compact.join("")
|
38
|
+
digest
|
24
39
|
end
|
25
40
|
|
26
41
|
def location_address
|
@@ -38,5 +53,17 @@ module Justiz
|
|
38
53
|
end
|
39
54
|
Digest.hexencode(sha256.digest)
|
40
55
|
end
|
56
|
+
|
57
|
+
def to_s
|
58
|
+
inspect
|
59
|
+
end
|
60
|
+
|
61
|
+
private
|
62
|
+
|
63
|
+
def remove_court_noise(court)
|
64
|
+
court = court.dup
|
65
|
+
COURT_NOISE.each { |noise| court.sub!(/#{noise}$/, '') }
|
66
|
+
court
|
67
|
+
end
|
41
68
|
end
|
42
69
|
end
|
@@ -11,9 +11,10 @@ module Justiz
|
|
11
11
|
end
|
12
12
|
|
13
13
|
def contacts
|
14
|
-
states.keys.map do |state|
|
14
|
+
contacts = states.keys.map do |state|
|
15
15
|
contacts_for(state)
|
16
16
|
end.flatten.compact
|
17
|
+
uniq_contacts(contacts)
|
17
18
|
end
|
18
19
|
|
19
20
|
def contacts_for(state)
|
@@ -21,17 +22,23 @@ module Justiz
|
|
21
22
|
return page.contacts unless page.limit_warning?
|
22
23
|
|
23
24
|
# do each type separately hoping to avoid limit warning
|
24
|
-
court_types.keys.map do |court_type|
|
25
|
+
contacts = court_types.keys.map do |court_type|
|
25
26
|
contacts_of_type(court_type, state)
|
26
|
-
end.flatten.compact
|
27
|
+
end.flatten.compact
|
28
|
+
uniq_contacts(contacts)
|
27
29
|
end
|
28
30
|
|
29
31
|
def contacts_of_type(type, state)
|
30
|
-
load_page(type, state, with_warning: true).contacts
|
32
|
+
contacts = load_page(type, state, with_warning: true).contacts
|
33
|
+
uniq_contacts(contacts)
|
31
34
|
end
|
32
35
|
|
33
36
|
private
|
34
37
|
|
38
|
+
def uniq_contacts(contacts)
|
39
|
+
contacts.uniq {|contact| contact.digest }
|
40
|
+
end
|
41
|
+
|
35
42
|
def home_page
|
36
43
|
@home_page ||= Page.new(agent.get('http://www.justizadressen.nrw.de/og.php?MD=nrw'))
|
37
44
|
end
|
@@ -69,7 +76,7 @@ module Justiz
|
|
69
76
|
end
|
70
77
|
|
71
78
|
def contacts
|
72
|
-
@contacts ||= parse_contacts
|
79
|
+
@contacts ||= parse_contacts.uniq
|
73
80
|
end
|
74
81
|
|
75
82
|
def parse_contacts
|
data/lib/justiz/version.rb
CHANGED
data/spec/lib/contact_spec.rb
CHANGED
@@ -34,4 +34,16 @@ describe Justiz::Contact do
|
|
34
34
|
digest = "76ef09d0c7d0078015df7a948cf0352c00f6451dab354389b21895a50d89a4a8"
|
35
35
|
expect(contact.digest).to eq(digest)
|
36
36
|
end
|
37
|
+
|
38
|
+
it "should remove - xxx - from court" do
|
39
|
+
court = "Something#{Justiz::Contact::COURT_NOISE.first}"
|
40
|
+
contact = Justiz::Contact.new(court: court)
|
41
|
+
expect(contact.court).to eq('Something')
|
42
|
+
end
|
43
|
+
|
44
|
+
it "should not remove - xxx - from court" do
|
45
|
+
court = 'Something - Not be removed -'
|
46
|
+
contact = Justiz::Contact.new(court: court)
|
47
|
+
expect(contact.court).to eq(court)
|
48
|
+
end
|
37
49
|
end
|
@@ -73,7 +73,7 @@ describe Justiz::Scraper::Courts do
|
|
73
73
|
VCR.use_cassette 'courts/all_nrw' do
|
74
74
|
contacts = subject.contacts_for('NRW')
|
75
75
|
#ap contacts
|
76
|
-
expect(contacts.count).to eq(
|
76
|
+
expect(contacts.count).to eq(377)
|
77
77
|
end
|
78
78
|
end
|
79
79
|
end
|
@@ -83,16 +83,16 @@ describe Justiz::Scraper::Courts do
|
|
83
83
|
VCR.use_cassette 'courts/all_all' do
|
84
84
|
original = {
|
85
85
|
"BRD" => "Bundesgerichte/-behörden 12",
|
86
|
-
"BW" => "Baden-Württemberg
|
87
|
-
"BAY" => "Bayern
|
86
|
+
"BW" => "Baden-Württemberg 284",
|
87
|
+
"BAY" => "Bayern 263",
|
88
88
|
"B" => "Berlin 38",
|
89
89
|
"BRA" => "Brandenburg 64",
|
90
90
|
"BRE" => "Bremen 19",
|
91
91
|
"HH" => "Hamburg 35",
|
92
92
|
"HES" => "Hessen 115",
|
93
93
|
"MV" => "Mecklenburg-Vorpommern 55",
|
94
|
-
"NS" => "Niedersachsen
|
95
|
-
"NRW" => "Nordrhein-Westfalen
|
94
|
+
"NS" => "Niedersachsen 262",
|
95
|
+
"NRW" => "Nordrhein-Westfalen 377",
|
96
96
|
"RPF" => "Rheinland-Pfalz 101",
|
97
97
|
"SAA" => "Saarland 32",
|
98
98
|
"SAC" => "Sachsen 79",
|
@@ -108,7 +108,7 @@ describe Justiz::Scraper::Courts do
|
|
108
108
|
total += count
|
109
109
|
end
|
110
110
|
expect(states).to eq(original)
|
111
|
-
expect(total).to eq(
|
111
|
+
expect(total).to eq(1926)
|
112
112
|
end
|
113
113
|
end
|
114
114
|
end
|
@@ -200,6 +200,13 @@ describe Justiz::Scraper::Courts do
|
|
200
200
|
expect(contact.post_address).to be_a(Justiz::Address)
|
201
201
|
end
|
202
202
|
end
|
203
|
+
|
204
|
+
it "should rename court" do
|
205
|
+
VCR.use_cassette 'courts/zvg_nrw' do
|
206
|
+
contact = subject.contacts_of_type('ZVG', 'NRW').first
|
207
|
+
expect(contact.court).to_not match(/Zwangsversteigerung/)
|
208
|
+
end
|
209
|
+
end
|
203
210
|
end
|
204
211
|
end
|
205
212
|
|