justiz 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
source 'https://rubygems.org'

# All dependencies are declared in justiz.gemspec; pull them in from there.
gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Mike Park
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # Justiz
2
+
3
+ Justiz extracts court contact data from http://www.justizadressen.nrw.de/.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'justiz'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install justiz
18
+
19
+ ## Usage
20
+
21
+ TODO: Write usage instructions here
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/bin/justiz ADDED
@@ -0,0 +1,5 @@
#!/usr/bin/env ruby
# Command-line entry point: loads the gem and starts the Thor-based Justiz::Cli.
require 'rubygems'
require 'justiz'

Justiz::Cli.start
data/justiz.gemspec ADDED
@@ -0,0 +1,28 @@
# coding: utf-8
# Gem specification for justiz.

# Put lib/ on the load path so the version constant can be read from the
# source tree.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'justiz/version'

Gem::Specification.new do |spec|
  spec.name          = "justiz"
  spec.version       = Justiz::VERSION
  spec.authors       = ["Mike Park"]
  spec.email         = ["mikep@quake.net"]
  spec.description   = %q{Extracts contact data.}
  spec.summary       = %q{Extract contact data from http://www.justizadressen.nrw.de/}
  spec.homepage      = ""
  spec.license       = "MIT"

  # Package every file tracked by git. An explicit "\n" separator is used
  # instead of the mutable $/ global.
  spec.files         = `git ls-files`.split("\n")
  spec.executables   = ['justiz']
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependencies: mechanize drives the scraper, thor the CLI.
  spec.add_dependency "mechanize"
  spec.add_dependency "thor"

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rspec", "~> 2.6"
  spec.add_development_dependency "awesome_print"
end
data/lib/justiz/cli.rb ADDED
@@ -0,0 +1,10 @@
1
+ require 'thor'
2
+
module Justiz
  # Thor-based command-line interface; bin/justiz starts it via Justiz::Cli.start.
  class Cli < Thor
    # Prints "Hello NAME" to standard output.
    desc "hello NAME", "say hello to NAME"
    def hello(name)
      puts "Hello #{name}"
    end
  end
end
@@ -0,0 +1,4 @@
module Justiz
  # One scraped contact record.
  #
  # A plain Hash subclass with no behaviour of its own. The scraper fills it
  # with the keys :court, :location, :post, :phone, :fax, :justiz_id, :url
  # and :email.
  class Contact < Hash
  end
end
@@ -0,0 +1,17 @@
1
+ require 'mechanize'
2
+ require 'logger'
3
+
module Justiz
  module Scraper
    # Mechanize agent preconfigured for the scraper: page content is always
    # read as UTF-8 and requests use the 'Mac Safari' user agent alias.
    class Agent < Mechanize
      def initialize
        super do |config|
          # Force UTF-8 instead of relying on the default encoding handling.
          config.default_encoding = 'UTF-8'
          config.force_default_encoding = true
        end
        # Uncomment to log the agent's activity to standard output:
        # self.log = Logger.new STDOUT
        self.user_agent_alias = 'Mac Safari'
      end
    end
  end
end
@@ -0,0 +1,134 @@
module Justiz
  module Scraper
    # Scrapes court and authority contact data from the search form at
    # http://www.justizadressen.nrw.de/.
    class Courts
      # Page holding the search form and the court type / state select boxes.
      HOME_URL = 'http://www.justizadressen.nrw.de/og.php?MD=nrw'.freeze

      # Lists the options of the 'gerausw' (court type) select box.
      #
      # @return [Hash{String => String}] option value => option label
      def court_types
        select_options(home_page, 'gerausw')
      end

      # Lists the options of the 'landausw' (state) select box.
      #
      # @return [Hash{String => String}] option value => option label
      def states
        select_options(home_page, 'landausw')
      end

      # Fetches the contacts for one court type in one state.
      #
      # When a query for 'ALL' court types triggers the site's result-limit
      # notice, every individual court type is queried instead and the
      # results are merged with duplicates removed. If a single court type
      # still hits the limit, a warning is written to standard error and
      # whatever that page lists is used as-is.
      #
      # @param court_type [String] a key of #court_types
      # @param state [String] a key of #states
      # @return [Array<Justiz::Contact>]
      def scrape(court_type, state)
        page = load_page(court_type, state)
        return parse_page(page) unless court_type == 'ALL' && limit_warning?(page)

        court_types.flat_map do |type, description|
          next [] if type == 'ALL'

          type_page = load_page(type, state)
          if limit_warning?(type_page)
            # Must go to stderr: puts(STDERR, msg) would print the STDERR
            # object and the message to stdout instead.
            $stderr.puts "Warning: State #{state} has too many contacts of #{description}[#{type}]"
          end
          parse_page(type_page)
        end.uniq
      end

      private

      # The search page, fetched once and memoized.
      def home_page
        @home_page ||= agent.get(HOME_URL)
      end

      # Fills in the third form of the home page and submits it with the
      # 'suchen1' button.
      def load_page(court_type, state)
        form = home_page.forms[2]
        form['gerausw'] = court_type
        form['landausw'] = state
        agent.submit(form, form.buttons_with(name: 'suchen1').first)
      end

      def agent
        @agent ||= Justiz::Scraper::Agent.new
      end

      # Turns every table row with exactly three cells (court name, addresses,
      # phone/web details) into a Justiz::Contact; other rows are ignored.
      def parse_page(page)
        rows = page.search('tr').map { |tr| tr.search('td').to_a }
        rows.select { |cells| cells.length == 3 }.map do |court, address_cell, phone_cell|
          addresses = AddressTd.new(address_cell)
          phones = AddressTd.new(phone_cell)

          Justiz::Contact.new.merge(court: court.text.strip,
                                    location: addresses.lieferanschrift,
                                    post: addresses.postfach,
                                    phone: phones.telefone,
                                    fax: phones.fax,
                                    justiz_id: phones.justiz_id,
                                    url: phones.url,
                                    email: phones.email)
        end
      end

      # @return [Boolean] true when any paragraph of the page contains the
      #   "Ihre Suchanfrage ergab mehr als" notice
      def limit_warning?(page)
        page.search('p').any? do |paragraph|
          # Match against a copy re-tagged as single-byte ISO-8859-15 so that
          # invalid UTF-8 byte sequences cannot make the regexp match raise.
          paragraph.text.dup.force_encoding('ISO-8859-15') =~ /Ihre Suchanfrage ergab mehr als/i
        end
      end

      # Maps option value => option text for the element with the given name.
      def select_options(page, name)
        page.search("[name='#{name}'] > option").each_with_object({}) do |option, options|
          options[option['value']] = option.text
        end
      end

      # Wraps one table cell and exposes its labelled fields.
      #
      # The cell's child nodes are reduced to their stripped, non-blank texts.
      # A field is then read either from the remainder of the line carrying
      # its label ("Telefon: ...") or from the line following its label.
      class AddressTd
        attr_reader :texts

        # @param node [#children] cell whose children each respond to #text
        def initialize(node)
          @texts = node.children.map { |child| child.text.strip }.reject { |text| blank?(text) }
        end

        def telefone
          same_line('Telefon:')
        end

        def fax
          same_line('Fax:')
        end

        def justiz_id
          same_line('XJustiz-ID:')
        end

        def lieferanschrift
          next_line('Lieferanschrift')
        end

        def postfach
          next_line('Postanschrift')
        end

        def url
          next_line('URL')
        end

        def email
          next_line('Mail')
        end

        private

        # True for nil, the empty string and whitespace-only strings.
        def blank?(something)
          something.to_s !~ /[^[:space:]]/
        end

        # Text following the first line that contains +name+
        # (case-insensitive), or nil when there is no such line.
        def next_line(name)
          label = Regexp.new(Regexp.escape(name), Regexp::IGNORECASE)
          index = texts.find_index { |text| text =~ label }
          index && texts[index + 1]
        end

        # Remainder of the first line that contains +name+ (case-insensitive),
        # stripped; nil when no line matches or the remainder is blank.
        def same_line(name)
          label = Regexp.new("#{Regexp.escape(name)}(.*)", Regexp::IGNORECASE)
          texts.each do |text|
            match = label.match(text)
            next unless match

            value = match[1].strip
            return blank?(value) ? nil : value
          end
          nil
        end
      end
    end
  end
end
@@ -0,0 +1,3 @@
module Justiz
  # Gem version; justiz.gemspec reads it as spec.version.
  VERSION = "0.0.1".freeze
end
data/lib/justiz.rb ADDED
@@ -0,0 +1,8 @@
1
+ require "justiz/version"
2
+ require 'justiz/cli'
3
+ require 'justiz/scraper/agent'
4
+ require 'justiz/scraper/courts'
5
+ require 'justiz/contact'
6
+
# Top-level namespace of the gem; its members are defined in the files
# required above.
module Justiz
end
@@ -0,0 +1,5 @@
1
+ require 'rspec'
2
+ require 'justiz'
3
+
describe Justiz::Contact do
  # Contact adds nothing to Hash, so only the Hash behaviour the scraper
  # relies on (merging in symbol keys) is checked.
  it 'behaves like a Hash' do
    contact = described_class.new.merge(court: 'Amtsgericht')
    expect(contact).to be_a(Hash)
    expect(contact[:court]).to eq('Amtsgericht')
  end
end
@@ -0,0 +1,10 @@
1
+ require 'rspec'
2
+
# Load the gem here: the requires above only pull in rspec.
require 'justiz'

# Replaces the IDE template placeholder (true.should == false), which could
# never pass, with a check that the gem loads and exposes its version.
describe Justiz do
  it 'has a version number' do
    expect(Justiz::VERSION).not_to be_nil
  end
end
@@ -0,0 +1,108 @@
1
+ # encoding: utf-8
2
+
3
+ require 'rspec'
4
+ require 'justiz'
5
+ require 'awesome_print'
6
+
# NOTE(review): nothing is stubbed here, so these examples presumably query
# the live site; the option lists and counts below were recorded in June 2013
# and may have drifted since.
describe Justiz::Scraper::Courts do
  it "should have court types" do
    types = {
      "ALL"  => "-- alle Gerichte/Behörden --",
      "AG"   => "Amtsgerichte",
      "LG"   => "Landgerichte",
      "OLG"  => "Oberlandesgerichte",
      "STA"  => "Staatsanwaltschaften",
      "MG"   => "Mahngerichte",
      "FAM"  => "Familiengerichte",
      "INS"  => "Insolvenzgerichte",
      "HREG" => "Handels-/Genossenschaftsregister",
      "VREG" => "Vereinsregistergerichte",
      "PREG" => "Partnerschaftsregistergerichte",
      "ZVG"  => "Zwangsversteigerungsgerichte",
      "ZVK"  => "Zentrale Vollstreckungsgerichte",
      "SG"   => "Sozialgerichte",
      "VG"   => "Verwaltungsgerichte",
      "ARB"  => "Arbeitsgerichte",
      "FG"   => "Finanzgerichte",
      "VFG"  => "Verfassungsgerichte",
      "JM"   => "Justizministerium",
      "JVA"  => "Vollzugseinrichtungen",
      "AND"  => "Sonstige Justizbehörden"
    }
    expect(subject.court_types).to eq(types)
  end

  it "should have states" do
    states = {
      "ALL" => "-- Auswahl über PLZ/Ort --",
      "BRD" => "Bundesgerichte/-behörden",
      "BW"  => "Baden-Württemberg",
      "BAY" => "Bayern",
      "B"   => "Berlin",
      "BRA" => "Brandenburg",
      "BRE" => "Bremen",
      "HH"  => "Hamburg",
      "HES" => "Hessen",
      "MV"  => "Mecklenburg-Vorpommern",
      "NS"  => "Niedersachsen",
      "NRW" => "Nordrhein-Westfalen",
      "RPF" => "Rheinland-Pfalz",
      "SAA" => "Saarland",
      "SAC" => "Sachsen",
      "SAH" => "Sachsen-Anhalt",
      "SH"  => "Schleswig-Holstein",
      "TH"  => "Thüringen"
    }
    expect(subject.states).to eq(states)
  end

  context "Bundesgerichte" do
    it "should find all Bundesgerichte" do
      contacts = subject.scrape('ALL', 'BRD')
      #ap contacts
      expect(contacts.count).to eq(12)
    end
  end

  context "NRW" do
    it "should find all NRW" do
      contacts = subject.scrape('ALL', 'NRW')
      #ap contacts
      expect(contacts.count).to eq(513)
    end
  end

  context "search all" do
    it "should find all entries" do
      # as of 13.6.2013
      original = {
        "ALL" => "-- Auswahl über PLZ/Ort -- 0",
        "BRD" => "Bundesgerichte/-behörden 12",
        "BW"  => "Baden-Württemberg 430",
        "BAY" => "Bayern 348",
        "B"   => "Berlin 38",
        "BRA" => "Brandenburg 64",
        "BRE" => "Bremen 19",
        "HH"  => "Hamburg 35",
        "HES" => "Hessen 115",
        "MV"  => "Mecklenburg-Vorpommern 55",
        "NS"  => "Niedersachsen 305",
        "NRW" => "Nordrhein-Westfalen 513",
        "RPF" => "Rheinland-Pfalz 101",
        "SAA" => "Saarland 32",
        "SAC" => "Sachsen 79",
        "SAH" => "Sachsen-Anhalt 69",
        "SH"  => "Schleswig-Holstein 58",
        "TH"  => "Thüringen 63"
      }
      # Append each state's contact count to its label so that one comparison
      # reports every state whose count changed.
      counted = subject.states.each_with_object({}) do |(code, label), result|
        result[code] = "#{label} #{subject.scrape('ALL', code).count}"
      end
      expect(counted).to eq(original)
    end
  end
end
108
+
metadata ADDED
@@ -0,0 +1,162 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: justiz
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Mike Park
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-06-14 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: mechanize
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: thor
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: bundler
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ~>
52
+ - !ruby/object:Gem::Version
53
+ version: '1.3'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: '1.3'
62
+ - !ruby/object:Gem::Dependency
63
+ name: rake
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: rspec
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ~>
84
+ - !ruby/object:Gem::Version
85
+ version: '2.6'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ~>
92
+ - !ruby/object:Gem::Version
93
+ version: '2.6'
94
+ - !ruby/object:Gem::Dependency
95
+ name: awesome_print
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :development
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ description: Extracts contact data.
111
+ email:
112
+ - mikep@quake.net
113
+ executables:
114
+ - justiz
115
+ extensions: []
116
+ extra_rdoc_files: []
117
+ files:
118
+ - .gitignore
119
+ - Gemfile
120
+ - LICENSE.txt
121
+ - README.md
122
+ - Rakefile
123
+ - bin/justiz
124
+ - justiz.gemspec
125
+ - lib/justiz.rb
126
+ - lib/justiz/cli.rb
127
+ - lib/justiz/contact.rb
128
+ - lib/justiz/scraper/agent.rb
129
+ - lib/justiz/scraper/courts.rb
130
+ - lib/justiz/version.rb
131
+ - spec/lib/contact_spec.rb
132
+ - spec/lib/justiz_spec.rb
133
+ - spec/lib/scraper/courts_spec.rb
134
+ homepage: ''
135
+ licenses:
136
+ - MIT
137
+ post_install_message:
138
+ rdoc_options: []
139
+ require_paths:
140
+ - lib
141
+ required_ruby_version: !ruby/object:Gem::Requirement
142
+ none: false
143
+ requirements:
144
+ - - ! '>='
145
+ - !ruby/object:Gem::Version
146
+ version: '0'
147
+ required_rubygems_version: !ruby/object:Gem::Requirement
148
+ none: false
149
+ requirements:
150
+ - - ! '>='
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
153
+ requirements: []
154
+ rubyforge_project:
155
+ rubygems_version: 1.8.25
156
+ signing_key:
157
+ specification_version: 3
158
+ summary: Extract contact data from http://www.justizadressen.nrw.de/
159
+ test_files:
160
+ - spec/lib/contact_spec.rb
161
+ - spec/lib/justiz_spec.rb
162
+ - spec/lib/scraper/courts_spec.rb