lobbyliste 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.travis.yml +9 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +76 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/lobbyliste +7 -0
- data/bin/setup +8 -0
- data/ext/pdfbox.jar +0 -0
- data/lib/lobbyliste/address.rb +79 -0
- data/lib/lobbyliste/core_ext/string.rb +5 -0
- data/lib/lobbyliste/downloader.rb +71 -0
- data/lib/lobbyliste/factories/address_factory.rb +129 -0
- data/lib/lobbyliste/factories/list_factory.rb +179 -0
- data/lib/lobbyliste/factories/organisation_factory.rb +113 -0
- data/lib/lobbyliste/factories/person_factory.rb +70 -0
- data/lib/lobbyliste/factories.rb +9 -0
- data/lib/lobbyliste/list.rb +34 -0
- data/lib/lobbyliste/organisation.rb +71 -0
- data/lib/lobbyliste/person.rb +27 -0
- data/lib/lobbyliste/version.rb +3 -0
- data/lib/lobbyliste.rb +19 -0
- data/lobbyliste.gemspec +30 -0
- metadata +170 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: a4ec8ab1a342db01916730a29a6061d70dd9e4f0
|
4
|
+
data.tar.gz: b65292668e6d0256df3c1b517a84157838a1a0c8
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: d119a69036498b8e8d124bb1a1a6244a5904ffb9576c68ae08afe7078318601bb6040678ba9a0598bf223f0dc9265bc157bb366e9eb1a6b46e93d798edd69910
|
7
|
+
data.tar.gz: fd4984edf6a3ef34632ca714911a871a52ea5160858188dc7c235e7fb7fdab682827f16ce1e7691ce359525cd63f77f8f86b37f73170009a92e039c8eb02f3b5
|
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
# Contributor Code of Conduct
|
2
|
+
|
3
|
+
As contributors and maintainers of this project, and in the interest of
|
4
|
+
fostering an open and welcoming community, we pledge to respect all people who
|
5
|
+
contribute through reporting issues, posting feature requests, updating
|
6
|
+
documentation, submitting pull requests or patches, and other activities.
|
7
|
+
|
8
|
+
We are committed to making participation in this project a harassment-free
|
9
|
+
experience for everyone, regardless of level of experience, gender, gender
|
10
|
+
identity and expression, sexual orientation, disability, personal appearance,
|
11
|
+
body size, race, ethnicity, age, religion, or nationality.
|
12
|
+
|
13
|
+
Examples of unacceptable behavior by participants include:
|
14
|
+
|
15
|
+
* The use of sexualized language or imagery
|
16
|
+
* Personal attacks
|
17
|
+
* Trolling or insulting/derogatory comments
|
18
|
+
* Public or private harassment
|
19
|
+
* Publishing other's private information, such as physical or electronic
|
20
|
+
addresses, without explicit permission
|
21
|
+
* Other unethical or unprofessional conduct
|
22
|
+
|
23
|
+
Project maintainers have the right and responsibility to remove, edit, or
|
24
|
+
reject comments, commits, code, wiki edits, issues, and other contributions
|
25
|
+
that are not aligned to this Code of Conduct, or to ban temporarily or
|
26
|
+
permanently any contributor for other behaviors that they deem inappropriate,
|
27
|
+
threatening, offensive, or harmful.
|
28
|
+
|
29
|
+
By adopting this Code of Conduct, project maintainers commit themselves to
|
30
|
+
fairly and consistently applying these principles to every aspect of managing
|
31
|
+
this project. Project maintainers who do not follow or enforce the Code of
|
32
|
+
Conduct may be permanently removed from the project team.
|
33
|
+
|
34
|
+
This code of conduct applies both within project spaces and in public spaces
|
35
|
+
when an individual is representing the project or its community.
|
36
|
+
|
37
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
38
|
+
reported by contacting a project maintainer at max@kopfueber.org. All
|
39
|
+
complaints will be reviewed and investigated and will result in a response that
|
40
|
+
is deemed necessary and appropriate to the circumstances. Maintainers are
|
41
|
+
obligated to maintain confidentiality with regard to the reporter of an
|
42
|
+
incident.
|
43
|
+
|
44
|
+
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
|
45
|
+
version 1.3.0, available at
|
46
|
+
[http://contributor-covenant.org/version/1/3/0/][version]
|
47
|
+
|
48
|
+
[homepage]: http://contributor-covenant.org
|
49
|
+
[version]: http://contributor-covenant.org/version/1/3/0/
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2016 DarthMax
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
# Lobbyliste
|
2
|
+
|
3
|
+
[Build Status](https://travis-ci.org/FHG-IMW/lobbyliste) [Inline docs](https://inch-ci.org/github/FHG-IMW/lobbyliste/suggestions?branch=master)
|
4
|
+
|
5
|
+
This gem crawls and parses the list of lobbyists which is published as a PDF by the German Bundestag.
|
6
|
+
Our goal is to provide a simple and easy to maintain parser.
|
7
|
+
|
8
|
+
## Installation
|
9
|
+
|
10
|
+
Add this line to your application's Gemfile:
|
11
|
+
|
12
|
+
```ruby
|
13
|
+
gem 'lobbyliste'
|
14
|
+
```
|
15
|
+
|
16
|
+
And then execute:
|
17
|
+
|
18
|
+
$ bundle
|
19
|
+
|
20
|
+
Or install it yourself as:
|
21
|
+
|
22
|
+
$ gem install lobbyliste
|
23
|
+
|
24
|
+
*NOTE: This gem requires JAVA to be installed. We use [PDFBox](https://pdfbox.apache.org/) for PDF extraction as this currently seems to be the best alternative*
|
25
|
+
|
26
|
+
## Usage
|
27
|
+
|
28
|
+
```ruby
|
29
|
+
require 'lobbyliste'
|
30
|
+
|
31
|
+
list = Lobbyliste.fetch_and_parse
|
32
|
+
organisation = list.organisations.first
|
33
|
+
|
34
|
+
organisation.name #=> 1219. Deutsche Stiftung für interreligiösen und interkulturellen Dialog e. V.
|
35
|
+
|
36
|
+
organisation.people.map {|person| person.name} #=> ["Claudius Groß", "Markus Hoymann", "Thomas M. Schimmel"]
|
37
|
+
|
38
|
+
organisation.tags #=> ["Kultur", "Religion"]
|
39
|
+
|
40
|
+
organisation.abbreviations #=> []
|
41
|
+
|
42
|
+
address = organisation.address
|
43
|
+
puts address.full_address
|
44
|
+
# 1219. Deutsche Stiftung für interreligiösen und interkulturellen Dialog e. V.
|
45
|
+
# Hinter der katholischen Kirche 3
|
46
|
+
# 10117 Berlin
|
47
|
+
# Deutschland
|
48
|
+
# Tel: +4930 51057773
|
49
|
+
# Fax: +4930 51057785
|
50
|
+
# Email: schimmel@1219.eu
|
51
|
+
# http://www.1219.eu
|
52
|
+
```
|
53
|
+
|
54
|
+
### CLI
|
55
|
+
|
56
|
+
You can also use this gem on your command line. It will dump the complete list as JSON.
|
57
|
+
|
58
|
+
For example, to create a gzipped JSON file run:
|
59
|
+
|
60
|
+
```bash
|
61
|
+
$ lobbyliste | gzip > lobbyliste.json.gz
|
62
|
+
```
|
63
|
+
|
64
|
+
|
65
|
+
# Special Thanks
|
66
|
+
|
67
|
+
- [Sebastian Vollnhals (@yetzt)](https://github.com/yetzt) - for his excellent node based scraper for the lobbyliste (https://github.com/yetzt/scraper-lobbyliste) from which many lines were reused.
|
68
|
+
|
69
|
+
# Contributing
|
70
|
+
|
71
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/FHG-IMW/lobbyliste. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
72
|
+
|
73
|
+
|
74
|
+
## License
|
75
|
+
|
76
|
+
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "lobbyliste"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start
|
data/bin/lobbyliste
ADDED
data/bin/setup
ADDED
data/ext/pdfbox.jar
ADDED
Binary file
|
@@ -0,0 +1,79 @@
|
|
1
|
+
module Lobbyliste
|
2
|
+
|
3
|
+
# This class represents addresses found in the lobbylist.
|
4
|
+
class Address
|
5
|
+
# @return [String] organisation name (the bold part)
|
6
|
+
attr_reader :name
|
7
|
+
|
8
|
+
# @return [String] Everything that is not part of the name or any other field
|
9
|
+
attr_reader :address
|
10
|
+
|
11
|
+
# @return [String] Postcode
|
12
|
+
attr_reader :postcode
|
13
|
+
|
14
|
+
# @return [String] City
|
15
|
+
attr_reader :city
|
16
|
+
|
17
|
+
# @return [String] the country, default: "Deutschland"
|
18
|
+
attr_reader :country
|
19
|
+
|
20
|
+
# @return [String] the telephone number if given (german numbers are automatically prefixed with +49)
|
21
|
+
attr_reader :tel
|
22
|
+
|
23
|
+
# @return [String] the fax number if given (german numbers are automatically prefixed with +49)
|
24
|
+
attr_reader :fax
|
25
|
+
|
26
|
+
# @return [String] website url
|
27
|
+
attr_reader :website
|
28
|
+
|
29
|
+
# @return [String] contact email address
|
30
|
+
attr_reader :email
|
31
|
+
|
32
|
+
# @return [Symbol] address type, :primary for 1. address, :secondary for all others
|
33
|
+
attr_reader :type
|
34
|
+
|
35
|
+
def initialize(name, address, postcode, city, country, tel, fax, website, email, type)
|
36
|
+
@name = name
|
37
|
+
@address = address
|
38
|
+
@postcode = postcode
|
39
|
+
@city = city
|
40
|
+
@country = country
|
41
|
+
@tel = tel
|
42
|
+
@fax = fax
|
43
|
+
@website = website
|
44
|
+
@email = email
|
45
|
+
@type=type
|
46
|
+
end
|
47
|
+
|
48
|
+
# Builds a human-readable, multi-line rendering of the address.
#
# Lines appear in this order: name, address, "postcode city", country,
# telephone, fax, email, website. Fields that are nil are left out, and a
# line that would end up empty (for example when neither postcode nor city
# is known) is dropped instead of being emitted as a blank line.
#
# @return [String] pretty formatted address of all existing address fields
def full_address
  # Postcode and city share one line; either of them may be missing.
  postcode_city = [@postcode, @city].compact.join(" ")

  lines = [@name, @address, postcode_city, @country]
  lines << "Tel: #{@tel}" if @tel
  lines << "Fax: #{@fax}" if @fax
  lines << "Email: #{@email}" if @email
  lines << @website if @website

  # Reject nil fields as well as empty strings so the result contains no
  # blank rows.
  lines.reject { |line| line.nil? || line.to_s.empty? }.join("\n")
end
|
63
|
+
|
64
|
+
def to_json(*a)
|
65
|
+
{
|
66
|
+
name: name,
|
67
|
+
address: address,
|
68
|
+
postcode: postcode,
|
69
|
+
city: city,
|
70
|
+
country: country,
|
71
|
+
tel: tel,
|
72
|
+
fax: fax,
|
73
|
+
email: email,
|
74
|
+
website: website,
|
75
|
+
type: type.to_s
|
76
|
+
}.to_json(*a)
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
module Lobbyliste
|
5
|
+
|
6
|
+
# This class finds the lobbyliste pdf on the Bundestag website, downloads it and extracts the pdf content
|
7
|
+
class Downloader
|
8
|
+
|
9
|
+
# @return [String] raw content of pdf file
|
10
|
+
def pdf_data
|
11
|
+
retrieve_pdf unless @pdf_data
|
12
|
+
@pdf_data
|
13
|
+
end
|
14
|
+
|
15
|
+
|
16
|
+
# @return [String] extracted content of pdf file
|
17
|
+
def text_data
|
18
|
+
extract_pdf unless @text_data
|
19
|
+
@text_data
|
20
|
+
end
|
21
|
+
|
22
|
+
# @return [String] extracted content of pdf file in html format
|
23
|
+
def html_data
|
24
|
+
extract_pdf unless @html_data
|
25
|
+
@html_data
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
# Since this link changes with every new version we download the Lobbyliste website and try to extract the link
|
31
|
+
# @return [String] the link to the Lobbyliste pdf
|
32
|
+
def pdf_link
|
33
|
+
website = Nokogiri::HTML(open("https://www.bundestag.de/dokumente/lobbyliste"))
|
34
|
+
link = website.css(".inhalt a[title^='Aktuelle Fassung']").first
|
35
|
+
|
36
|
+
raise "Could no find PDF link on the website!" unless link
|
37
|
+
"https://bundestag.de#{link['href']}"
|
38
|
+
end
|
39
|
+
|
40
|
+
def retrieve_pdf
|
41
|
+
@pdf_data = open(pdf_link) {|f| f.read}
|
42
|
+
end
|
43
|
+
|
44
|
+
|
45
|
+
def extract_pdf
|
46
|
+
pdf_file = Tempfile.new(["lobbyliste",".pdf"])
|
47
|
+
pdf_file.write(pdf_data)
|
48
|
+
pdf_file.rewind
|
49
|
+
|
50
|
+
@text_data = run_extraction(pdf_file)
|
51
|
+
@html_data = run_extraction(pdf_file,true)
|
52
|
+
ensure
|
53
|
+
pdf_file.close
|
54
|
+
pdf_file.unlink
|
55
|
+
end
|
56
|
+
|
57
|
+
# Runs the bundled PDFBox jar ("ExtractText") on a PDF and returns what it
# wrote to a temporary output file.
#
# The command is passed to Kernel#system as an argument list, so no shell is
# involved and paths containing spaces or shell metacharacters are handed to
# java unchanged. stdout and stderr of the java process are discarded.
#
# @param pdf_file [#path] file object holding the PDF; only its path is used
# @param html [Boolean] when true, "-html" is passed to ExtractText
# @return [String] content of the output file written by the extraction
# @raise [RuntimeError] if java could not be started or exited unsuccessfully
def run_extraction(pdf_file,html=false)
  tmp_file = Tempfile.new(["lobbyliste"])

  # Same argument order as before: input file, optional flag, output file.
  command = ["/usr/bin/java", "-jar", jar_path, "ExtractText", pdf_file.path]
  command << "-html" if html
  command << tmp_file.path

  # system returns false on a non-zero exit status and nil if the command
  # could not be executed at all; both count as failure.
  status = system(*command, out: File::NULL, err: File::NULL)
  raise "PDF extraction failed" unless status

  tmp_file.read
ensure
  # tmp_file is nil if Tempfile.new itself raised.
  if tmp_file
    tmp_file.close
    tmp_file.unlink
  end
end
|
66
|
+
|
67
|
+
def jar_path
|
68
|
+
File.join(File.dirname(File.expand_path(__FILE__)), '../../ext/pdfbox.jar')
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
@@ -0,0 +1,129 @@
|
|
1
|
+
module Lobbyliste
|
2
|
+
module Factories
|
3
|
+
# This class is used to build an address from raw data
|
4
|
+
# Since it is too hard to separate the name and address data without markup,
|
5
|
+
# we use the html data to accomplish that
|
6
|
+
class AddressFactory
|
7
|
+
|
8
|
+
# @return [Lobbyliste::Address]
|
9
|
+
def self.build(name,raw_data, type=:primary)
|
10
|
+
factory = new(name,raw_data,type)
|
11
|
+
::Lobbyliste::Address.new(
|
12
|
+
factory.name,
|
13
|
+
factory.address,
|
14
|
+
factory.postcode,
|
15
|
+
factory.city,
|
16
|
+
factory.country,
|
17
|
+
factory.tel,
|
18
|
+
factory.fax,
|
19
|
+
factory.website,
|
20
|
+
factory.email,
|
21
|
+
factory.type
|
22
|
+
)
|
23
|
+
end
|
24
|
+
|
25
|
+
attr_reader :name, :tel, :fax, :website, :email, :country, :postcode, :city, :type
|
26
|
+
|
27
|
+
def initialize(name,raw_data,type=:primary)
|
28
|
+
@name = name
|
29
|
+
@raw_data = raw_data
|
30
|
+
|
31
|
+
@address = []
|
32
|
+
@tel = nil
|
33
|
+
@fax = nil
|
34
|
+
@website = nil
|
35
|
+
@email = nil
|
36
|
+
@country = "Deutschland"
|
37
|
+
@postcode = nil
|
38
|
+
@city = nil
|
39
|
+
@type=type
|
40
|
+
|
41
|
+
parse
|
42
|
+
end
|
43
|
+
|
44
|
+
def parse
|
45
|
+
@raw_data.each_with_index do |line,i|
|
46
|
+
case label(line,i)
|
47
|
+
when :addr then @address << line
|
48
|
+
when :tel then extract_tel_fax(line)
|
49
|
+
when :postcode then extract_postcode_city(line)
|
50
|
+
when :email then extract_email(line)
|
51
|
+
when :website then extract_website(line)
|
52
|
+
when :country then @country = line
|
53
|
+
else next
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
def address
|
59
|
+
@address.join(", ")
|
60
|
+
end
|
61
|
+
|
62
|
+
private
|
63
|
+
|
64
|
+
# Classifies one raw line of an address section.
#
# The checks run in a fixed order and the first one that matches wins; a line
# that matches nothing is treated as part of the street address.
#
# @param line [String] the raw line to classify
# @param i [Integer] index of +line+ within @raw_data
# @return [Symbol] one of :name, :tel, :email, :website, :postcode, :addr,
#   :country (:tel is returned for both "Tel." and "Fax" lines)
def label(line,i)
  # this line is part of the name
  return :name if name.include?(line)

  return :tel if line.match(/^(Tel\.|Fax): /)
  return :email if line.match(/^E\-Mail\: /)
  return :website if line.match(/^Internet\:/)

  # international postcodes
  return :postcode if line.match(/^\d{5,7}\s(.+)$/)
  # UK postcodes
  return :postcode if line.match(/^([A-Z0-9]{3}\s?[A-Z0-9]{3})$/)

  # if the line looks like an address
  return :addr if line.match(/(c\/o|^postfach\b|[Ss]tr(aße|\.)?\b|[Aa]llee\b|[Pp]latz\b|[Gg]asse\b|[Ww]eg\b|\b([0-9]+\-)?[0-9]+\s?[a-zA-Z]*$|[Vv]orstand|[Ss]ekretär|[Gg]eschäfts)/)
  # if the previous line ended with e.V. (or an abbreviation in parentheses)
  # this line is addr. The first line has no previous line: without the
  # i > 0 guard, @raw_data[-1] would silently inspect the LAST line instead.
  return :addr if i > 0 && @raw_data[i-1].match(/(e\.\s?V\.|\([A-Z]+\)$)/)

  return :country if %w(Niederlande Belgien Schweiz Luxemburg Dänemark Österreich Tschechien Polen USA Israel Russland).include?(line)
  # a single Word with a capital letter is probably a country
  # NOTE(review): the literal "?" in the character class may be a mangled
  # "ß" - confirm against the upstream source before changing it.
  return :country if i > 3 && line.match(/^[A-Z][a-zA-Zöüä?]+$/)
  return :country if line == "Vereinigtes Königreich"

  :addr
end
|
89
|
+
|
90
|
+
# Extracts the telephone and/or fax number from a "Tel.: ... Fax: ..." line
# and stores the normalised values in @tel / @fax. A value is only
# overwritten when the corresponding number is present in the line.
#
# @param line [String] raw line starting with "Tel.: " or "Fax: "
# @return [void]
def extract_tel_fax(line)
  tel_match = line.match(/Tel\.\: ((\(?\d+\)? )?(\d+\s?)+\d+)/)
  @tel = normalize_phone_number(tel_match[1]) if tel_match

  fax_match = line.match(/Fax\: ((\(?\d+\)? )?(\d+\s?)+\d+)/)
  @fax = normalize_phone_number(fax_match[1]) if fax_match
end

# Strips parentheses and converts the leading dialling prefix to "+" form.
#
# A leading "00" is the international call prefix and becomes "+", keeping
# the foreign country code that follows. A single leading "0" is the German
# trunk prefix and becomes "+49". Numbers with no leading zero are returned
# unchanged apart from the removed parentheses.
#
# @param number [String] number as captured from the raw line
# @return [String] normalised number
def normalize_phone_number(number)
  number.gsub(/[\(|\)]/,"").sub(/\A00/,"+").sub(/\A0/,"+49")
end
|
101
|
+
|
102
|
+
def extract_postcode_city(line)
|
103
|
+
_postcode_city = line.match(/(\d{5,7})\s?(.+)?/)
|
104
|
+
if _postcode_city
|
105
|
+
@postcode = _postcode_city[1]
|
106
|
+
@city = _postcode_city[2]
|
107
|
+
else
|
108
|
+
_uk_postcode = line.match(/^([A-Z0-9]{3}\s?[A-Z0-9]{3})$/)
|
109
|
+
@postcode = _uk_postcode[1] if _uk_postcode
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
def extract_website(line)
|
114
|
+
_website = line.match(/^Internet\:\s?(.+)$/)
|
115
|
+
if _website
|
116
|
+
@website = _website[1]
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
120
|
+
def extract_email(line)
|
121
|
+
_email = line.match(/^E\-Mail\:\s?(.+)$/)
|
122
|
+
if _email
|
123
|
+
@email = _email[1]
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
end
|
128
|
+
end
|
129
|
+
end
|
@@ -0,0 +1,179 @@
|
|
1
|
+
module Lobbyliste
|
2
|
+
module Factories
|
3
|
+
# This class is used to build the list from raw data
|
4
|
+
class ListFactory
|
5
|
+
attr_reader :data
|
6
|
+
|
7
|
+
# @return [Lobbyliste::List]
|
8
|
+
def self.build(text_data,html_data)
|
9
|
+
factory = new(text_data,html_data)
|
10
|
+
::Lobbyliste::List.new(
|
11
|
+
factory.organisations,
|
12
|
+
factory.tags,
|
13
|
+
factory.abbreviations,
|
14
|
+
factory.last_update
|
15
|
+
)
|
16
|
+
end
|
17
|
+
|
18
|
+
def initialize(text_data,html_data)
|
19
|
+
@text_data = text_data
|
20
|
+
@html_data = html_data
|
21
|
+
|
22
|
+
@lines = text_data.each_line.to_a.map(&:chomp)
|
23
|
+
|
24
|
+
@organisations = nil
|
25
|
+
@tags = nil
|
26
|
+
@abbreviations = nil
|
27
|
+
@names = nil
|
28
|
+
end
|
29
|
+
|
30
|
+
def organisations
|
31
|
+
return @organisations if @organisations
|
32
|
+
|
33
|
+
@organisations = organisations_data.map do |organisation_data|
|
34
|
+
id = organisation_data[0].to_i
|
35
|
+
name = names[id]
|
36
|
+
tags = tags_for_organisation(id)
|
37
|
+
abbreviations = abbreviations_for_organisation(id)
|
38
|
+
::Lobbyliste::Factories::OrganisationFactory.build(name,organisation_data,tags,abbreviations)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
def names
|
43
|
+
extract_names unless @names
|
44
|
+
@names
|
45
|
+
end
|
46
|
+
|
47
|
+
|
48
|
+
def tags
|
49
|
+
return @tags if @tags
|
50
|
+
|
51
|
+
tags = Hash.new{|h,k| h[k] = []}
|
52
|
+
tag_data = extract_tag_data
|
53
|
+
|
54
|
+
current_tag = "A"
|
55
|
+
tag_data.each do |line|
|
56
|
+
if line.match(/^[A-ZÄÖÜ][a-zäöüß]+$/) && [current_tag[0],current_tag[0].next].include?(line[0])
|
57
|
+
current_tag = line
|
58
|
+
elsif line.match(/^\– \d+/)
|
59
|
+
id = line.match(/^\– (\d+)/)[1].to_i
|
60
|
+
tags[current_tag] << id
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
@tags = tags
|
65
|
+
end
|
66
|
+
|
67
|
+
|
68
|
+
def abbreviations
|
69
|
+
return @abbreviations if @abbreviations
|
70
|
+
abbreviations = Hash.new{|h,k| h[k] = []}
|
71
|
+
current_abbr = "A"
|
72
|
+
extract_abbreviation_data.each do |line|
|
73
|
+
if line.match(/^[A-ZÄÖÜ][A-ZÄÖÜa-zäöüß]+$/) && [current_abbr[0],current_abbr[0].next].include?(line[0])
|
74
|
+
current_abbr = line
|
75
|
+
elsif line.match(/^\– \d+/)
|
76
|
+
id = line.match(/^\– (\d+)/)[1].to_i
|
77
|
+
abbreviations[current_abbr] << id
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
@abbreviations = abbreviations
|
82
|
+
end
|
83
|
+
|
84
|
+
# Reads the document date from the first line of the text data that starts
# with "Stand: dd.mm.yyyy".
#
# @return [Date, nil] the date the document was last updated, or nil when the
#   text contains no such line
# @raise [ArgumentError] if the matched digits do not form a valid date
def last_update
  date = @text_data.match(/^Stand: (\d\d\.\d\d\.\d\d\d\d)/)
  # Without this guard a document lacking the stamp failed with a
  # NoMethodError on nil.
  return nil unless date

  # The stamp is always day.month.year, so parse it with an explicit format.
  Date.strptime(date[1], "%d.%m.%Y")
end
|
88
|
+
|
89
|
+
private
|
90
|
+
|
91
|
+
|
92
|
+
def organisations_data
|
93
|
+
start_lines = []
|
94
|
+
end_line = nil
|
95
|
+
|
96
|
+
@lines.each_with_index do |line,i|
|
97
|
+
if possible_organisation_id?(line) && begin_organisation?(@lines[i+1])
|
98
|
+
start_lines << i
|
99
|
+
elsif line == "Stichwortverzeichnis"
|
100
|
+
end_line = i - 1
|
101
|
+
break
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
organisations_data = start_lines.each_cons(2).map do |a,b|
|
106
|
+
@lines[a..b-1]
|
107
|
+
end
|
108
|
+
|
109
|
+
organisations_data.
|
110
|
+
push(@lines[start_lines.last..end_line]).
|
111
|
+
map { |data| data.reject {|line| ignored_line?(line)} }
|
112
|
+
end
|
113
|
+
|
114
|
+
|
115
|
+
|
116
|
+
def extract_tag_data
|
117
|
+
start_line = @lines.index {|line| line == "Stichwortverzeichnis"}
|
118
|
+
@lines.
|
119
|
+
drop(start_line+1).
|
120
|
+
take_while {|line| !(line == "Verzeichnis der anderen Namensformen")}.
|
121
|
+
reject {|line| ignored_line?(line)}
|
122
|
+
end
|
123
|
+
|
124
|
+
def extract_abbreviation_data
|
125
|
+
start_line = @lines.index {|line| line == "Verzeichnis der anderen Namensformen"}
|
126
|
+
@lines.
|
127
|
+
drop(start_line+1).
|
128
|
+
reject {|line| ignored_line?(line)}
|
129
|
+
end
|
130
|
+
|
131
|
+
def tags_for_organisation(organisation_id)
|
132
|
+
|
133
|
+
tags.
|
134
|
+
select {|_,organisation_ids| organisation_ids.include?(organisation_id)}.
|
135
|
+
map(&:first)
|
136
|
+
end
|
137
|
+
|
138
|
+
def abbreviations_for_organisation(organisation_id)
|
139
|
+
abbreviations.
|
140
|
+
select {|_,organisation_ids| organisation_ids.include?(organisation_id)}.
|
141
|
+
map(&:first)
|
142
|
+
end
|
143
|
+
|
144
|
+
|
145
|
+
def extract_names
|
146
|
+
names = {}
|
147
|
+
|
148
|
+
regexp = Regexp.compile(/<p><b>(\d+)\n<\/b>N a m e u n d S i t z \, 1 \. A d r e s s e\n<\/p>\n<p><b>(.*?)\n<\/b>/m)
|
149
|
+
|
150
|
+
@html_data.to_enum(:scan, regexp).each do
|
151
|
+
match = Regexp.last_match
|
152
|
+
names[match[1].to_i] = CGI.unescape_html(match[2].gsub("\n"," "))
|
153
|
+
end
|
154
|
+
|
155
|
+
@names = names
|
156
|
+
end
|
157
|
+
|
158
|
+
|
159
|
+
def ignored_line?(line)
|
160
|
+
regexps = [
|
161
|
+
/^– \d+ –$/,
|
162
|
+
/^Aktuelle Fassung der öffentlichen Liste/,
|
163
|
+
/^Die Zahlen verweisen auf die fortlaufenden Nummern im Hauptteil/,
|
164
|
+
/^\n$/
|
165
|
+
|
166
|
+
]
|
167
|
+
regexps.any? {|regexp| line.match(regexp)}
|
168
|
+
end
|
169
|
+
|
170
|
+
def possible_organisation_id?(line)
|
171
|
+
line =~ /^\d+$/
|
172
|
+
end
|
173
|
+
|
174
|
+
def begin_organisation?(line)
|
175
|
+
line =~/^N a m e u n d S i t z \, 1 \. A d r e s s e$/
|
176
|
+
end
|
177
|
+
end
|
178
|
+
end
|
179
|
+
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
module Lobbyliste
|
2
|
+
module Factories
|
3
|
+
|
4
|
+
# This class is used to build an organisation from raw data
|
5
|
+
class OrganisationFactory
|
6
|
+
|
7
|
+
# @return [Lobbyliste::Organisation]
|
8
|
+
def self.build(name, raw_data,tags,abbreviations)
|
9
|
+
factory = new(name, raw_data)
|
10
|
+
::Lobbyliste::Organisation.new(
|
11
|
+
factory.id,
|
12
|
+
factory.name,
|
13
|
+
factory.address,
|
14
|
+
factory.additional_address,
|
15
|
+
factory.address_at_bt_br,
|
16
|
+
factory.people,
|
17
|
+
factory.interests,
|
18
|
+
factory.members,
|
19
|
+
factory.associated_organisations,
|
20
|
+
tags,
|
21
|
+
abbreviations
|
22
|
+
)
|
23
|
+
end
|
24
|
+
|
25
|
+
attr_reader :name
|
26
|
+
|
27
|
+
def initialize(name,raw_data)
|
28
|
+
@name = name
|
29
|
+
@raw_data = raw_data
|
30
|
+
end
|
31
|
+
|
32
|
+
def id
|
33
|
+
@raw_data.first.to_i
|
34
|
+
end
|
35
|
+
|
36
|
+
|
37
|
+
def address
|
38
|
+
data = read_section("N a m e u n d S i t z , 1 . A d r e s s e")
|
39
|
+
AddressFactory.build(name, data, :primary)
|
40
|
+
end
|
41
|
+
|
42
|
+
# Builds the address listed in the "Weitere Adresse" section.
#
# @return [Lobbyliste::Address, nil] the secondary address, or nil when the
#   section is missing or only contains the "–" placeholder
def additional_address
  data = read_section("W e i t e r e A d r e s s e")
  # read_section returns [] when the heading is not found; treat that like
  # the explicit "–" placeholder instead of building an address from nothing.
  return nil if data.empty? || data[0] == "–"
  AddressFactory.build(name, data, :secondary)
end
|
47
|
+
|
48
|
+
# Builds the address listed in the "Anschrift am Sitz von BT und BRg" section.
#
# @return [Lobbyliste::Address, nil] the address, or nil when the section is
#   missing, only contains the "–" placeholder, or merely refers to another
#   section ("(s. Abschnitt ...")
def address_at_bt_br
  data = read_section("A n s c h r i f t a m S i t z v o n B T u n d B R g")
  # read_section returns [] when the heading is not found; data[0] would then
  # be nil and calling match on it raised a NoMethodError.
  return nil if data.empty? || data[0] == "–" || data[0].match(/\(s\. Abschnitt/)
  AddressFactory.build(name, data, :secondary)
end
|
53
|
+
|
54
|
+
def people
|
55
|
+
data = read_section("V o r s t a n d u n d G e s c h ä f t s f ü h r u n g")
|
56
|
+
data.concat read_section("V e r b a n d s v e r t r e t e r / - i n n e n")
|
57
|
+
data.reject! {|line| ignored_person_line?(line)}
|
58
|
+
|
59
|
+
data.map { |person| PersonFactory.build(person) }.uniq.reject(&:nil?)
|
60
|
+
end
|
61
|
+
|
62
|
+
def interests
|
63
|
+
read_section("I n t e r e s s e n b e r e i c h").join(" ").gsub(/(- )(?=[a-z])/,"")
|
64
|
+
end
|
65
|
+
|
66
|
+
def members
|
67
|
+
read_section("M i t g l i e d e r z a h l")[0].to_i || nil
|
68
|
+
end
|
69
|
+
|
70
|
+
def associated_organisations
|
71
|
+
read_section("A n z a h l d e r a n g e s c h l o s s e n e n O r g a n i s a t i o n e n")[0].to_i || nil
|
72
|
+
end
|
73
|
+
|
74
|
+
|
75
|
+
|
76
|
+
private
|
77
|
+
def new_section?(line)
|
78
|
+
line =~ /^([a-zA-Z\d\,\.\-\/äöüß]\s){3,}\w$/
|
79
|
+
end
|
80
|
+
|
81
|
+
def read_section(section)
|
82
|
+
start_line = @raw_data.index {|line| line == section}
|
83
|
+
return [] unless start_line
|
84
|
+
|
85
|
+
@raw_data.drop(start_line+1).take_while {|line| !new_section?(line)}
|
86
|
+
end
|
87
|
+
|
88
|
+
# Tells whether a line from the people sections is known not to be a person
# (placeholders, cross references, role headings and a few special cases).
#
# @param line [String] a single raw line
# @return [Boolean] true if the line matches one of the ignore patterns
def ignored_person_line?(line)
  [
    /^–$/,
    /\(s\. Abschnitt/,
    /\:$/,
    /^GdW$/,
    /^Forschung$/,
    /^des Verwaltungsrats$/,
    /^Schatzmeister$/,
    /^Kinder- u\. Jugendmed\.$/,
    /^u\. Kinderchirurgen$/,
    /^Finanzen & Recht I$/,
    /^Geschäftsführ(er(in)?|ung)$/,
    # Was terminated with "^" instead of "$", which made it unmatchable.
    /^gleichzeitig Verbandsdirektor$/,
    /^(stellvertretender )?Vorsitzender?$/,
    /^weitere Vorstandsmitglieder$/,
    /^Managementgesellschaft des DZVhÄ/,
    /^Besonderer Vertreter nach § 30/,
    # NOTE(review): "Sektretär" looks like a misspelling of "Sekretär", but
    # it may mirror a typo in the source document - confirm before changing.
    /^Sektretär$/,
    /^Alleingesellschafter: Ev\.Werk für/
  ].any? {|regexp| line.match(regexp) }
end
|
110
|
+
|
111
|
+
end
|
112
|
+
end
|
113
|
+
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
module Lobbyliste
|
2
|
+
module Factories
|
3
|
+
|
4
|
+
# This class is used to build a person from raw line data
|
5
|
+
# It has to split titles from the actual name
|
6
|
+
class PersonFactory
|
7
|
+
|
8
|
+
# A list of regular expressions that try to match any titles that might occur
|
9
|
+
REGEX = [
|
10
|
+
# Prof titles
|
11
|
+
/Prof\.?/,
|
12
|
+
# Dr. titles
|
13
|
+
/(PhD|Dr\.(([a-z]+\.?)+|(\-\w+\.?))*)/,
|
14
|
+
# Dipl. titles
|
15
|
+
/(Magister|(Dipl[\.|\-]+)([\w|ÖöÄäÜüß]*[\.|\-]*)*)/,
|
16
|
+
# other academic abbreviations
|
17
|
+
/\b([a-z|A-Z]{2,}\.)+/,
|
18
|
+
# h.c.
|
19
|
+
/h\.\s?c\./,
|
20
|
+
# general abbreviations
|
21
|
+
/\(?\b(M\.?Sc|B\.?Sc|B\.?sc|FH|fh|BA|Ba|TH|Th|VWA|univ|US|PD|RA|CCM|LD|Ing\.|OTL|CISA|CIA|CISM|CRISC|StB|vBP|StD|habil|med)\b\)?/,
|
22
|
+
# job titles
|
23
|
+
/((^\s*und)?(Mathematiker|(Bundes)?Vorstand|^Arzt|Assistent|Generalstabs[aä]rzt|Augenoptikermeister|Bankkauf|Bau-Ing\.|Betriebswirtschaftslehre|Mediziner|agrar|Schneidwerkzeugmechanikermeister|Studienassessor|Gesundheitsökonom|Syndikus|Straßenbaumeister|^Bruder|Pater|Prälat|Theologe|Jurist|\bmed|Ministerpräsident|(Medizin)?pädagog|Verwaltungs-Wirt|Veterinär|^Kauf|Generalleutnant|General|Generalarzt|Ergotherapeut|Fregattenkapitän|Agrarbiolog|Amtsanw[aä]lt|Apotheker|(Freie )?Architekt|(Berg-)?Assessor|Betriebswirt|^Bischof( von [a-z]+)?|Botschafter|(vereidigter )?Buchprüfer|Bundesbankoberamtsrat|Bundesinnungsmeister|Bundesminister|Bundespräsident|Bundestagspräsident|Bürgermeister|Chefapotheker|Diakon|Dompropst|Augenoptiker|Optometrist|Biolog|Brennmeister|Chemiker|((Kommunikations|Grafik)[\-\s]?)?Designer|Finanzwirt|Forstwirt|Geograph|Geolog|Geophysiker|Handelslehrer|Holzwirt|Informatiker|Jurist|Kauf|Mathematiker|Medizinpädagog|Meteorolog|physiker|Politolog|Psycholog|Pädagog|Rechtspfleger|Restaurator|Sachverständig|Sozialpädagog|Sozialwirt|Sozialwissenschaftler|Soziolog|Stomatolog|Verwaltungsbetriebswirt|Verwaltungswirt|Verwaltungswissenschaftler|Volkswirt|Wirtschafts(ing)?|Wirtschaftsjurist|Ökonom|Übersetzer|Domkapitular|Probst|Finanzfachwirt|(Forst|Bau)assessor|Hauptfeldwebel|Hauptmann|Honorar|Honorargeneralkonsul|Honorarkonsul|Justizminister|Kapitän(leutnant)?|Konsul|Land(es)?rat|Landwirtschaftsmeister|Lohnsteuerberater|Landwirtschaftsdirektor|Luftverkehrskauf|Magistratsr[aä]t|BreigGen|Generalmajor|Major|Minist|Ministerialdirektor|Ministerialdirigent|Monsignore|Notar|Oberamtsanw[aä]lt|Oberbürgermeister|Oberfeldarzt|Obermeister|Oberst|Oberstaatsanw[aä]lt|Oberstabsboots|Oberstabsfeldwebel|Oberstleutnant|Oberstudiendirektor|Staatssekretär|Parlamentarischer|Parlamentspräsident|Patentanw[aä]lt|(Landesjugend)?[Pp]farrer|Politikwissenschaftler|Privatdozent|Priv(\\.|at)-?Dozent|RA|Ran|Fachanwalt( für 
[a-z]+)?|Realschulrektor|Rechtsreferent|Regierungsamts(rat)?|Regionspräsident|Revieroberjäger|(Vorsitzender? )?Richter(in)?( am [a-zA-Z]+)?|Senator|Staats(minister|sekretär)|Stabsfeldwebel|Stabshauptmann|(Steuer|Wirtschafts)(berater|prüfer)|Stuckateurmeister|Studiendirektor|Studienrat|Uni(v(ersitäts)?)?|Verleger|Rechtsjournalist|Rechtsjurist|Veterinärdirektor|Vizepräsident des Verwaltungsgerichts|Visuelle Kommunikation|Zahntechnikermeister)(er|e|\s?in|mann|frau)?)(\s?am\s(Finanzgericht|Bundesverwaltungsgerichtshof|Verwaltungsgerichtshof))?(\bD\.)?(\s?a\.D\.)?([\.\-,\s\/]+|$)/,
|
24
|
+
# Beamtenbezeichnung?
|
25
|
+
/^D\.\s/,
|
26
|
+
# more abbreviations
|
27
|
+
/\b[A-ZÜÄÖ]{2,}\b/,
|
28
|
+
# Everything after the first comma
|
29
|
+
/,.*$/
|
30
|
+
]
|
31
|
+
|
32
|
+
|
33
|
+
# @return [Lobbyliste::Person] builds a new person, might be nil if the line does not represent a person
|
34
|
+
def self.build(raw_data)
|
35
|
+
factory = new(raw_data)
|
36
|
+
factory.is_person? ? ::Lobbyliste::Person.new(factory.name,factory.titles) : nil
|
37
|
+
end
|
38
|
+
|
39
|
+
def initialize(raw_data)
|
40
|
+
@raw_data = raw_data
|
41
|
+
@name = nil
|
42
|
+
end
|
43
|
+
|
44
|
+
def name
|
45
|
+
return @name if @name
|
46
|
+
|
47
|
+
@name = @raw_data.dup
|
48
|
+
REGEX.each do |regex|
|
49
|
+
@name = clean(@name.gsub(regex,""))
|
50
|
+
end
|
51
|
+
|
52
|
+
@name
|
53
|
+
end
|
54
|
+
|
55
|
+
def titles
|
56
|
+
@raw_data.gsub(name,"").split(", ").map(&:squish).reject{|x| x==""}
|
57
|
+
end
|
58
|
+
|
59
|
+
def is_person?
|
60
|
+
!name.nil? && name.length > 2 && name.include?(" ")
|
61
|
+
end
|
62
|
+
|
63
|
+
private
|
64
|
+
|
65
|
+
def clean(string)
|
66
|
+
string.gsub(/^(\s*[,-:\(\)\|\.])*/,"").squish
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Lobbyliste

  # Holds the result of one parse run of the lobby list.
  class List

    # @return [Array] list of organisations
    attr_reader :organisations

    # @return [Hash] maps each tag to an Array of organisation ids
    attr_reader :tags

    # @return [Hash] maps each abbreviation to an Array of organisation ids
    attr_reader :abbreviations

    # @return [Date] the date when the document was last updated
    attr_reader :last_update

    # @param organisations [Array] list of organisations
    # @param tags [Hash] tag => Array of organisation ids
    # @param abbreviations [Hash] abbreviation => Array of organisation ids
    # @param last_update [Date] the date when the document was last updated
    def initialize(organisations, tags, abbreviations, last_update)
      @organisations, @tags = organisations, tags
      @abbreviations, @last_update = abbreviations, last_update
    end

    # Serialises the list as a JSON object with the keys organisations, tags,
    # abbreviations and last_update (in that order).
    #
    # @return [String] the JSON document
    def to_json(*a)
      payload = {}
      payload[:organisations] = organisations
      payload[:tags] = tags
      payload[:abbreviations] = abbreviations
      payload[:last_update] = last_update
      payload.to_json(*a)
    end
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module Lobbyliste
  # Plain data holder for one organisation of the lobby list.
  class Organisation

    # @return [Integer] the id of the organisation. This number is not fixed and may change with every new document version
    attr_reader :id

    # @return [String] the organisation's name
    attr_reader :name

    # @return [Lobbyliste::Address] the primary address of the organisation
    attr_reader :address

    # @return [Lobbyliste::Address] the address stated under "Weitere Adresse"
    attr_reader :additional_address

    # @return [Lobbyliste::Address] the address stated under "Anschrift am Sitz von BT und BRg"
    attr_reader :address_at_bt_br

    # @return [Array] list of {Lobbyliste::Person}, covering all members stated under "Vorstand und Geschäftsführung" and "Verbandsvertreter/-innen"
    attr_reader :people

    # @return [String] interests as stated under "Interessenbereich"
    attr_reader :interests

    # @return [Integer] number of members as stated under "Mitgliederzahl"
    attr_reader :members

    # @return [Integer] number of associated organisations as stated under "Anzahl der angeschlossenen Organisationen"
    attr_reader :associated_organisations

    # @return [Array] list of tags
    attr_reader :tags

    # @return [Array] list of abbreviations
    attr_reader :abbreviations

    # Stores every argument unchanged in the reader of the same name.
    def initialize(id, name, address, additional_address, address_at_bt_br, people, interests, members, associated_organisations,tags,abbreviations)
      @id, @name = id, name
      @address, @additional_address, @address_at_bt_br = address, additional_address, address_at_bt_br
      @people, @interests = people, interests
      @members, @associated_organisations = members, associated_organisations
      @tags, @abbreviations = tags, abbreviations
    end

    # @return [Array] all known addresses (primary, additional, BT/BRg), without the missing ones
    def addresses
      [address, additional_address, address_at_bt_br].compact
    end

    # Serialises the organisation as a JSON object. The three address
    # attributes are emitted as the single "addresses" array.
    #
    # @return [String] the JSON document
    def to_json(*a)
      payload = {}
      payload[:id] = id
      payload[:name] = name
      payload[:addresses] = addresses
      payload[:people] = people
      payload[:interests] = interests
      payload[:members] = members
      payload[:associated_organisations] = associated_organisations
      payload[:tags] = tags
      payload[:abbreviations] = abbreviations
      payload.to_json(*a)
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Lobbyliste
  # Class to encapsulate a person.
  class Person

    # @return [String] the person's name (hopefully) stripped of all titles
    attr_reader :name

    # @return [Array] list of all titles (job, academic, positions)
    attr_reader :titles

    # @param name [String] the person's name
    # @param titles [Array] list of all titles
    def initialize(name, titles)
      @name = name
      @titles = titles
    end

    # Two people are equal when both their names and their titles are equal.
    #
    # @param other [Object] the object to compare with
    # @return [Boolean] false for anything that does not expose name and titles
    def ==(other)
      # Comparing with nil or an unrelated object must answer false instead
      # of raising NoMethodError.
      return false unless other.respond_to?(:name) && other.respond_to?(:titles)

      name == other.name && titles == other.titles
    end

    # Serialises the person as a JSON object with the keys name and titles.
    #
    # @return [String] the JSON document
    def to_json(*a)
      {
        name: name,
        titles: titles
      }.to_json(*a)
    end
  end
end
|
data/lib/lobbyliste.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
require "lobbyliste/version"
|
2
|
+
require "lobbyliste/factories"
|
3
|
+
require "lobbyliste/list"
|
4
|
+
require "lobbyliste/organisation"
|
5
|
+
require "lobbyliste/address"
|
6
|
+
require "lobbyliste/person"
|
7
|
+
require "lobbyliste/downloader"
|
8
|
+
require "lobbyliste/core_ext/string"
|
9
|
+
require 'json'
|
10
|
+
|
11
|
+
module Lobbyliste

  # Downloads the PDF and parses it.
  #
  # Creates a Downloader and passes its text and HTML data to ListFactory.build.
  #
  # @return [Lobbyliste::List] the result of ListFactory.build (presumably a List; confirm against the factory)
  def self.fetch_and_parse
    source = Downloader.new
    Factories::ListFactory.build(source.text_data, source.html_data)
  end
end
|
data/lobbyliste.gemspec
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'lobbyliste/version'
|
5
|
+
|
6
|
+
# Gem specification for lobbyliste.
Gem::Specification.new do |spec|
  spec.name          = "lobbyliste"
  spec.version       = Lobbyliste::VERSION
  spec.authors       = ["DarthMax"]
  spec.email         = ["max@kopfueber.org"]

  spec.summary       = %q{Ruby crawler for the list of lobbyists published by German Bundestag}
  # Fixed the duplicated word ("the the") and dropped the trailing space.
  spec.description   = %q{This gem crawls and parses the list of lobbyists which is published as a PDF by the German Bundestag.}
  spec.homepage      = "https://github.com/DarthMax/lobbyliste"
  spec.license       = "MIT"

  # Package every file tracked by git except tests, specs and features.
  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.bindir        = "bin"
  spec.executables   = ["lobbyliste"]
  spec.require_paths = ["lib"]

  # Runtime dependencies
  spec.add_dependency 'nokogiri'

  # Development dependencies
  spec.add_development_dependency "bundler", "~> 1.11"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "minitest", "~> 5.0"
  spec.add_development_dependency "mocha"
  spec.add_development_dependency "vcr"
  spec.add_development_dependency "webmock"
end
|
metadata
ADDED
@@ -0,0 +1,170 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: lobbyliste
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- DarthMax
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-06-29 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: bundler
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.11'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.11'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '10.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '10.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: minitest
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '5.0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '5.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: mocha
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: vcr
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: webmock
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
description: 'This gem crawls and parses the the list of lobbyists which is published
|
112
|
+
as a PDF by the German Bundestag. '
|
113
|
+
email:
|
114
|
+
- max@kopfueber.org
|
115
|
+
executables:
|
116
|
+
- lobbyliste
|
117
|
+
extensions: []
|
118
|
+
extra_rdoc_files: []
|
119
|
+
files:
|
120
|
+
- ".gitignore"
|
121
|
+
- ".travis.yml"
|
122
|
+
- CODE_OF_CONDUCT.md
|
123
|
+
- Gemfile
|
124
|
+
- LICENSE.txt
|
125
|
+
- README.md
|
126
|
+
- Rakefile
|
127
|
+
- bin/console
|
128
|
+
- bin/lobbyliste
|
129
|
+
- bin/setup
|
130
|
+
- ext/pdfbox.jar
|
131
|
+
- lib/lobbyliste.rb
|
132
|
+
- lib/lobbyliste/address.rb
|
133
|
+
- lib/lobbyliste/core_ext/string.rb
|
134
|
+
- lib/lobbyliste/downloader.rb
|
135
|
+
- lib/lobbyliste/factories.rb
|
136
|
+
- lib/lobbyliste/factories/address_factory.rb
|
137
|
+
- lib/lobbyliste/factories/list_factory.rb
|
138
|
+
- lib/lobbyliste/factories/organisation_factory.rb
|
139
|
+
- lib/lobbyliste/factories/person_factory.rb
|
140
|
+
- lib/lobbyliste/list.rb
|
141
|
+
- lib/lobbyliste/organisation.rb
|
142
|
+
- lib/lobbyliste/person.rb
|
143
|
+
- lib/lobbyliste/version.rb
|
144
|
+
- lobbyliste.gemspec
|
145
|
+
homepage: https://github.com/DarthMax/lobbyliste
|
146
|
+
licenses:
|
147
|
+
- MIT
|
148
|
+
metadata: {}
|
149
|
+
post_install_message:
|
150
|
+
rdoc_options: []
|
151
|
+
require_paths:
|
152
|
+
- lib
|
153
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
154
|
+
requirements:
|
155
|
+
- - ">="
|
156
|
+
- !ruby/object:Gem::Version
|
157
|
+
version: '0'
|
158
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
159
|
+
requirements:
|
160
|
+
- - ">="
|
161
|
+
- !ruby/object:Gem::Version
|
162
|
+
version: '0'
|
163
|
+
requirements: []
|
164
|
+
rubyforge_project:
|
165
|
+
rubygems_version: 2.5.1
|
166
|
+
signing_key:
|
167
|
+
specification_version: 4
|
168
|
+
summary: Ruby crawler for the list of lobbyists published by German Bundestag
|
169
|
+
test_files: []
|
170
|
+
has_rdoc:
|