justiz 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in justiz.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Mike Park
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # Justiz
2
+
3
+ Extracts contact data from http://www.justizadressen.nrw.de/.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'justiz'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install justiz
18
+
19
+ ## Usage
20
+
21
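+ From Ruby, call `Justiz::Scraper::Courts.new.scrape('ALL', 'NRW')` with a court type code and a state code to get the matching contacts; `court_types` and `states` on the same object return the codes the site offers.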
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/bin/justiz ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'justiz'
4
+
5
+ Justiz::Cli.start
data/justiz.gemspec ADDED
@@ -0,0 +1,28 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'justiz/version'
5
+
6
# Gem specification for justiz: package metadata, the packaged file list, the
# `justiz` executable, and the runtime/development dependencies.
Gem::Specification.new do |spec|
  spec.name          = "justiz"
  spec.version       = Justiz::VERSION
  spec.authors       = ["Mike Park"]
  spec.email         = ["mikep@quake.net"]
  spec.description   = %q{Extracts contact data.}
  spec.summary       = %q{Extract contact data from http://www.justizadressen.nrw.de/}
  # NOTE(review): homepage is still empty; fill in the project URL once known.
  spec.homepage      = ""
  spec.license       = "MIT"

  # Ask git for a NUL-separated listing (-z) so that file names containing
  # newlines, whitespace or characters git would otherwise quote are packaged
  # under their real names.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = ['justiz']
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependencies: HTTP scraping (mechanize) and the CLI (thor).
  spec.add_dependency "mechanize"
  spec.add_dependency "thor"

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rspec", "~> 2.6"
  spec.add_development_dependency "awesome_print"
end
data/lib/justiz/cli.rb ADDED
@@ -0,0 +1,10 @@
1
+ require 'thor'
2
+
3
module Justiz
  # Thor command set behind the `justiz` executable, which calls
  # Justiz::Cli.start.
  class Cli < Thor
    desc "hello NAME", "say hello to NAME"
    # Writes a one-line greeting for +name+ to standard output.
    #
    # @param name [String] the name to greet
    def hello(name)
      greeting = "Hello #{name}"
      puts greeting
    end
  end
end
@@ -0,0 +1,4 @@
1
module Justiz
  # Record type for one scraped contact. It adds nothing to Hash; the scraper
  # fills it with symbol keys such as :court, :location, :post, :phone, :fax,
  # :justiz_id, :url and :email.
  class Contact < Hash
  end
end
@@ -0,0 +1,17 @@
1
+ require 'mechanize'
2
+ require 'logger'
3
+
4
module Justiz
  module Scraper
    # Mechanize subclass carrying the settings the scraper relies on:
    # default_encoding 'UTF-8' with force_default_encoding switched on, and
    # the 'Mac Safari' user agent alias.
    class Agent < Mechanize
      def initialize
        super do |mech|
          mech.force_default_encoding = true
          mech.default_encoding = 'UTF-8'
        end
        # Request logging is left disabled; enable for debugging:
        #self.log = Logger.new STDOUT
        self.user_agent_alias = 'Mac Safari'
      end
    end
  end
end
@@ -0,0 +1,134 @@
1
module Justiz
  module Scraper
    # Scrapes court and authority contact data from the search form on
    # justizadressen.nrw.de.
    class Courts
      # Page that carries the search form and its two <select> boxes.
      HOME_URL = 'http://www.justizadressen.nrw.de/og.php?MD=nrw'.freeze

      # @return [Hash{String => String}] option value => label of the
      #   'gerausw' (court type) select box
      def court_types
        select_options(home_page, 'gerausw')
      end

      # @return [Hash{String => String}] option value => label of the
      #   'landausw' (state) select box
      def states
        select_options(home_page, 'landausw')
      end

      # Runs a search and returns the contacts found.
      #
      # When an 'ALL' search triggers the site's result-limit notice, every
      # individual court type is queried instead and the results are merged
      # without duplicates.
      #
      # @param court_type [String] a key of #court_types
      # @param state [String] a key of #states
      # @return [Array<Justiz::Contact>]
      def scrape(court_type, state)
        page = load_page(court_type, state)
        return parse_page(page) unless court_type == 'ALL' && limit_warning?(page)

        scrape_each_type(state)
      end

      private

      # Queries every court type except 'ALL' for +state+. A type that still
      # hits the result limit is reported on standard error and its (partial)
      # page is parsed anyway.
      def scrape_each_type(state)
        single_types = court_types.reject { |type, _description| type == 'ALL' }
        contacts = single_types.flat_map do |type, description|
          type_page = load_page(type, state)
          if limit_warning?(type_page)
            $stderr.puts("Warning: State #{state} has too many contacts of #{description}[#{type}]")
          end
          parse_page(type_page)
        end
        contacts.uniq
      end

      # The search page is fetched once and reused for all later calls.
      def home_page
        @home_page ||= agent.get(HOME_URL)
      end

      # Fills the third form of the home page with the two selections and
      # submits it through the button named 'suchen1'.
      def load_page(court_type, state)
        form = home_page.forms[2]
        form['gerausw'] = court_type
        form['landausw'] = state
        agent.submit(form, form.buttons_with(name: 'suchen1').first)
      end

      def agent
        @agent ||= Justiz::Scraper::Agent.new
      end

      # Turns every table row with exactly three cells (court, addresses,
      # phones) into a Justiz::Contact.
      def parse_page(page)
        rows = page.search('tr').map { |tr| tr.search('td').to_a }
        contact_rows = rows.select { |cells| cells.length == 3 }
        contact_rows.map do |court, address_cell, phone_cell|
          address = AddressTd.new(address_cell)
          phone = AddressTd.new(phone_cell)

          Justiz::Contact.new.merge(court: court.text.strip,
                                    location: address.lieferanschrift,
                                    post: address.postfach,
                                    phone: phone.telefone,
                                    fax: phone.fax,
                                    justiz_id: phone.justiz_id,
                                    url: phone.url,
                                    email: phone.email)
        end
      end

      # @return [Boolean] whether any paragraph carries the "more results
      #   than can be shown" notice
      def limit_warning?(page)
        page.search('p').any? do |paragraph|
          # avoid invalid UTF-8 errors by force encoding.
          paragraph.text.force_encoding("ISO-8859-15") =~ /Ihre Suchanfrage ergab mehr als/i
        end
      end

      # Collects value => label for the options directly below the element
      # whose name attribute is +name+.
      def select_options(page, name)
        page.search("[name='#{name}'] > option").each_with_object({}) do |option, options|
          options[option['value']] = option.text
        end
      end

      # Wraps one table cell and looks values up by their label text.
      class AddressTd
        # @return [Array<String>] stripped, non-blank texts of the cell's
        #   child nodes, in document order
        attr_reader :texts

        # @param node [#children] cell whose children each respond to #text
        def initialize(node)
          stripped = node.children.to_a.map { |child| child.text.strip }
          @texts = stripped.reject { |text| blank?(text) }
        end

        def telefone
          same_line('Telefon:')
        end

        def fax
          same_line('Fax:')
        end

        def justiz_id
          same_line('XJustiz-ID:')
        end

        def lieferanschrift
          next_line('Lieferanschrift')
        end

        def postfach
          next_line('Postanschrift')
        end

        def url
          next_line('URL')
        end

        def email
          next_line('Mail')
        end

        private

        # True for nil, the empty string and whitespace-only strings.
        def blank?(something)
          something.to_s !~ /[^[:space:]]/
        end

        # Text following the first entry that contains +name+
        # (case-insensitive); nil when no entry matches or nothing follows.
        def next_line(name)
          label = Regexp.new(Regexp.escape(name), Regexp::IGNORECASE)
          position = texts.index { |text| text =~ label }
          position && texts[position + 1]
        end

        # Remainder of the first entry that contains +name+
        # (case-insensitive), stripped; nil when no entry matches or the
        # remainder is blank. The label is escaped so it is matched literally.
        def same_line(name)
          pattern = Regexp.new("#{Regexp.escape(name)}(.*)", Regexp::IGNORECASE)
          texts.each do |text|
            found = pattern.match(text)
            next unless found

            value = found[1].strip
            return blank?(value) ? nil : value
          end
          nil
        end
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Justiz
  # Gem version, read by justiz.gemspec. Frozen so the shared constant cannot
  # be modified in place.
  VERSION = "0.0.1".freeze
end
data/lib/justiz.rb ADDED
@@ -0,0 +1,8 @@
1
+ require "justiz/version"
2
+ require 'justiz/cli'
3
+ require 'justiz/scraper/agent'
4
+ require 'justiz/scraper/courts'
5
+ require 'justiz/contact'
6
+
7
# Top-level namespace of the gem. It is intentionally empty here; its members
# come from the files required above.
module Justiz
end
@@ -0,0 +1,5 @@
1
+ require 'rspec'
2
+ require 'justiz'
3
+
4
# Example group reserved for Justiz::Contact; it contains no examples yet.
describe Justiz::Contact do
end
@@ -0,0 +1,10 @@
1
+ require 'rspec'
2
+
3
describe 'My behaviour' do
  # Placeholder left over from the IDE file template. The generated body
  # asserted `true.should == false`, which can never pass and kept the suite
  # red. An example declared without a block is reported as pending instead.
  it 'should do something'
end
@@ -0,0 +1,108 @@
1
+ # encoding: utf-8
2
+
3
+ require 'rspec'
4
+ require 'justiz'
5
+ require 'awesome_print'
6
+
7
# Specs for Justiz::Scraper::Courts. The examples call court_types, states and
# scrape on the implicit subject and compare the results with recorded values,
# so they go through the scraper's real page requests.
describe Justiz::Scraper::Courts do
  it "should have court types" do
    expected_types = {
      "ALL" => "-- alle Gerichte/Behörden --",
      "AG" => "Amtsgerichte",
      "LG" => "Landgerichte",
      "OLG" => "Oberlandesgerichte",
      "STA" => "Staatsanwaltschaften",
      "MG" => "Mahngerichte",
      "FAM" => "Familiengerichte",
      "INS" => "Insolvenzgerichte",
      "HREG" => "Handels-/Genossenschaftsregister",
      "VREG" => "Vereinsregistergerichte",
      "PREG" => "Partnerschaftsregistergerichte",
      "ZVG" => "Zwangsversteigerungsgerichte",
      "ZVK" => "Zentrale Vollstreckungsgerichte",
      "SG" => "Sozialgerichte",
      "VG" => "Verwaltungsgerichte",
      "ARB" => "Arbeitsgerichte",
      "FG" => "Finanzgerichte",
      "VFG" => "Verfassungsgerichte",
      "JM" => "Justizministerium",
      "JVA" => "Vollzugseinrichtungen",
      "AND" => "Sonstige Justizbehörden"
    }
    expect(subject.court_types).to eq(expected_types)
  end

  it "should have states" do
    expected_states = {
      "ALL" => "-- Auswahl über PLZ/Ort --",
      "BRD" => "Bundesgerichte/-behörden",
      "BW" => "Baden-Württemberg",
      "BAY" => "Bayern",
      "B" => "Berlin",
      "BRA" => "Brandenburg",
      "BRE" => "Bremen",
      "HH" => "Hamburg",
      "HES" => "Hessen",
      "MV" => "Mecklenburg-Vorpommern",
      "NS" => "Niedersachsen",
      "NRW" => "Nordrhein-Westfalen",
      "RPF" => "Rheinland-Pfalz",
      "SAA" => "Saarland",
      "SAC" => "Sachsen",
      "SAH" => "Sachsen-Anhalt",
      "SH" => "Schleswig-Holstein",
      "TH" => "Thüringen"
    }
    expect(subject.states).to eq(expected_states)
  end

  context "Bundesgerichte" do
    it "should find all Bundesgerichte" do
      federal_contacts = subject.scrape('ALL', 'BRD')
      # ap federal_contacts   # uncomment to inspect the scraped records
      expect(federal_contacts.count).to eq(12)
    end
  end

  context "NRW" do
    it "should find all NRW" do
      nrw_contacts = subject.scrape('ALL', 'NRW')
      # ap nrw_contacts   # uncomment to inspect the scraped records
      expect(nrw_contacts.count).to eq(513)
    end
  end

  context "search all" do
    it "should find all entries" do
      # Label plus contact count per state, as of 13.6.2013.
      recorded = {
        "ALL" => "-- Auswahl über PLZ/Ort -- 0",
        "BRD" => "Bundesgerichte/-behörden 12",
        "BW" => "Baden-Württemberg 430",
        "BAY" => "Bayern 348",
        "B" => "Berlin 38",
        "BRA" => "Brandenburg 64",
        "BRE" => "Bremen 19",
        "HH" => "Hamburg 35",
        "HES" => "Hessen 115",
        "MV" => "Mecklenburg-Vorpommern 55",
        "NS" => "Niedersachsen 305",
        "NRW" => "Nordrhein-Westfalen 513",
        "RPF" => "Rheinland-Pfalz 101",
        "SAA" => "Saarland 32",
        "SAC" => "Sachsen 79",
        "SAH" => "Sachsen-Anhalt 69",
        "SH" => "Schleswig-Holstein 58",
        "TH" => "Thüringen 63"
      }
      # Append the number of scraped contacts to every state label.
      labelled_counts = subject.states.each_with_object({}) do |(code, label), memo|
        total = subject.scrape('ALL', code).count
        memo[code] = label + " #{total}"
      end
      expect(labelled_counts).to eq(recorded)
    end
  end
end
108
+
metadata ADDED
@@ -0,0 +1,162 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: justiz
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Mike Park
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-06-14 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: mechanize
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: thor
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: bundler
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ~>
52
+ - !ruby/object:Gem::Version
53
+ version: '1.3'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: '1.3'
62
+ - !ruby/object:Gem::Dependency
63
+ name: rake
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: rspec
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ~>
84
+ - !ruby/object:Gem::Version
85
+ version: '2.6'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ~>
92
+ - !ruby/object:Gem::Version
93
+ version: '2.6'
94
+ - !ruby/object:Gem::Dependency
95
+ name: awesome_print
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :development
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ description: Extracts contact data.
111
+ email:
112
+ - mikep@quake.net
113
+ executables:
114
+ - justiz
115
+ extensions: []
116
+ extra_rdoc_files: []
117
+ files:
118
+ - .gitignore
119
+ - Gemfile
120
+ - LICENSE.txt
121
+ - README.md
122
+ - Rakefile
123
+ - bin/justiz
124
+ - justiz.gemspec
125
+ - lib/justiz.rb
126
+ - lib/justiz/cli.rb
127
+ - lib/justiz/contact.rb
128
+ - lib/justiz/scraper/agent.rb
129
+ - lib/justiz/scraper/courts.rb
130
+ - lib/justiz/version.rb
131
+ - spec/lib/contact_spec.rb
132
+ - spec/lib/justiz_spec.rb
133
+ - spec/lib/scraper/courts_spec.rb
134
+ homepage: ''
135
+ licenses:
136
+ - MIT
137
+ post_install_message:
138
+ rdoc_options: []
139
+ require_paths:
140
+ - lib
141
+ required_ruby_version: !ruby/object:Gem::Requirement
142
+ none: false
143
+ requirements:
144
+ - - ! '>='
145
+ - !ruby/object:Gem::Version
146
+ version: '0'
147
+ required_rubygems_version: !ruby/object:Gem::Requirement
148
+ none: false
149
+ requirements:
150
+ - - ! '>='
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
153
+ requirements: []
154
+ rubyforge_project:
155
+ rubygems_version: 1.8.25
156
+ signing_key:
157
+ specification_version: 3
158
+ summary: Extract contact data from http://www.justizadressen.nrw.de/
159
+ test_files:
160
+ - spec/lib/contact_spec.rb
161
+ - spec/lib/justiz_spec.rb
162
+ - spec/lib/scraper/courts_spec.rb