brandeins-dl 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/Gemfile +3 -0
- data/README.md +26 -0
- data/Rakefile +1 -0
- data/bin/brandeins +4 -0
- data/brandeins-dl.gemspec +23 -0
- data/lib/brandeins-dl/version.rb +3 -0
- data/lib/brandeins-dl.rb +228 -0
- metadata +94 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# About BrandEins Downloader
|
|
2
|
+
|
|
3
|
+
BrandEins Downloader is a command line tool to download former volumes
|
|
4
|
+
of the German economics magazine "Brand Eins". The articles of former volumes
|
|
5
|
+
are available through their website, and BrandEins Downloader takes all
|
|
6
|
+
these fragmented PDFs, downloads and merges them into a single pdf.
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
## Requirements
|
|
10
|
+
BrandEins Downloader uses *pdftk* and depends on *ruby*, *rubygems*, and
|
|
11
|
+
several ruby libraries (that you can get through rubygems)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
## Install
|
|
15
|
+
`gem install brandeins-dl`
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
## Usage
|
|
19
|
+
|
|
20
|
+
Download just one magazine
|
|
21
|
+
|
|
22
|
+
`brandeins download --path=/Path/where/to/download/the/files --year=2011 --volume=5`
|
|
23
|
+
|
|
24
|
+
Download the whole collection of a certain year
|
|
25
|
+
|
|
26
|
+
`brandeins download_all --path=/Path/where/to/download/the/files --year=2011`
|
data/Rakefile
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
require "bundler/gem_tasks"
|
data/bin/brandeins
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
|
3
|
+
require "brandeins-dl/version"
|
|
4
|
+
|
|
5
|
+
# Gem specification for brandeins-dl.
#
# The version is read from BrandEins::VERSION (required above from
# brandeins-dl/version). The file, test-file and executable lists are taken
# from `git ls-files`, so the gem has to be built from inside the git checkout.
Gem::Specification.new do |s|
  s.name        = "brandeins-dl"
  s.version     = BrandEins::VERSION
  s.authors     = ["Gregory Igelmund"]
  s.email       = ["gregory.igelmund@gmail.com"]
  s.homepage    = "http://www.grekko.de"
  s.summary     = %q{BrandEins Downloader allows you to download past volumes of the Brand Eins magazine}
  # Names the two commands the CLI actually defines (download_all and download).
  s.description = %q{BrandEins Downloader offers two commands: 'brandeins download_all --path=PATH --year=YEAR' and 'brandeins download --path=PATH --year=YEAR --volume=NUMBER'}

  #s.rubyforge_project = "brandeins-dl"

  # rake is only needed for the packaging tasks loaded by the Rakefile; the
  # library itself never requires it, so it is not a runtime dependency.
  s.add_development_dependency "rake"
  s.add_dependency "thor"
  s.add_dependency "nokogiri"

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]
end
|
data/lib/brandeins-dl.rb
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
require "brandeins-dl/version"
require 'nokogiri'
require 'open-uri'
require 'net/http'
require 'uri'
require 'fileutils'
require 'thor'
|
|
7
|
+
|
|
8
|
+
module BrandEins
|
|
9
|
+
|
|
10
|
+
# Thor-based command line interface. Each command builds a
# BrandEins::Downloader for the given --path and delegates to it.
class CLI < Thor
  desc "download_all", "Download all magazines of the defined year"
  method_option :year, :type => :numeric, :required => true
  method_option :path, :type => :string, :required => true
  # Downloads every volume of --year into --path.
  def download_all
    downloader = BrandEins::Downloader.new(options[:path])
    downloader.get_magazines_of_year(options[:year])
  end

  # The description previously repeated the download_all text, which made the
  # two commands indistinguishable in the generated help output.
  desc "download", "Download a single magazine volume of the defined year"
  method_option :path, :type => :string, :required => true
  method_option :volume, :type => :numeric, :required => true
  method_option :year, :type => :numeric, :required => true
  # Downloads the single volume --volume of --year into --path.
  def download
    downloader = BrandEins::Downloader.new(options[:path])
    downloader.get_magazine(options[:year], options[:volume])
  end
end
|
|
28
|
+
|
|
29
|
+
class Downloader
|
|
30
|
+
attr_reader :archive
|
|
31
|
+
|
|
32
|
+
# Sets up a downloader that works inside the directory +path+.
#
# Stores the site URL and the target directory, builds the ArchiveSite used
# to look up magazine links, and finally runs check_download_path on the
# target directory.
def initialize(path)
  @dl_dir  = path
  @url     = "http://www.brandeins.de"
  @archive = ArchiveSite.new

  check_download_path
end
|
|
39
|
+
|
|
40
|
+
# Makes sure the download directory (@dl_dir) exists.
#
# Uses FileUtils.mkdir_p so that missing parent directories are created as
# well and an already existing directory is left untouched. The previous
# File.exists? check no longer works on Ruby 3.2+, where that alias was
# removed, and Dir.mkdir could not create nested paths.
def check_download_path
  FileUtils.mkdir_p(@dl_dir)
end
|
|
43
|
+
|
|
44
|
+
# Downloads every magazine of +year+, one merged PDF per volume.
#
# year:: the year whose magazines are fetched (defaults to 2000)
#
# Volumes are numbered starting at 1, in the order the archive lists them.
# This matches get_magazine, which maps volume N to the (N-1)th archive link.
# The previous 0-based index produced "Volume 0" and files named "...-0.pdf".
# Returns the list of magazine links that was processed.
def get_magazines_of_year(year = 2000)
  puts "Getting all brand eins magazines of a #{year}. This could take a while..."
  magazine_links = @archive.magazine_links_by_year(year)
  magazine_links.each.with_index(1) do |magazine_link, volume|
    puts "Parsing Volume #{volume} of #{year}"
    target_pdf = get_target_pdf(year, volume)
    get_magazine_by_link(magazine_link, target_pdf)
  end
end
|
|
53
|
+
|
|
54
|
+
# Downloads a single magazine volume and merges it into one PDF.
#
# year::   the year of the magazine (defaults to 2000)
# volume:: 1-based position of the issue within that year's archive list
#
# Raises ArgumentError when +volume+ is not between 1 and the number of
# magazines found for +year+. Without this check, volume 0 would silently
# select the last issue (index -1) and a too-large volume would pass nil on.
def get_magazine(year = 2000, volume = 1)
  puts "Parsing Volume #{volume} of #{year}"
  target_pdf = get_target_pdf(year, volume)

  magazine_links = @archive.magazine_links_by_year(year)
  unless volume.between?(1, magazine_links.size)
    raise ArgumentError,
          "Volume #{volume} of #{year} not found (#{magazine_links.size} volumes available)"
  end

  get_magazine_by_link(magazine_links[volume - 1], target_pdf)
end
|
|
63
|
+
|
|
64
|
+
# Downloads all article PDFs of one magazine issue, merges them into
# +target_pdf+ inside the download directory and removes the fragments.
#
# target_magazine_link:: URL of the issue's archive page
# target_pdf::           file name of the merged PDF
def get_magazine_by_link(target_magazine_link, target_pdf)
  pdf_links = @archive.magazine_pdf_links(target_magazine_link)
  process_pdf_links(pdf_links, target_pdf)
  cleanup
end

# Builds the file name of the merged PDF for the given year and volume.
def get_target_pdf(year, volume)
  "Brand-Eins-#{year}-#{volume}.pdf"
end

# Downloads every PDF in +pdf_links+ into the download directory and merges
# the downloaded files into +target_pdf+.
#
# The downloaded fragment paths are remembered in @fragment_files so that
# cleanup can delete exactly those files afterwards.
# Returns the result of merge_pdfs.
def process_pdf_links(pdf_links, target_pdf)
  pdf_downloader = PDFDownloader.new(pdf_links, @dl_dir)
  @fragment_files = pdf_downloader.download_all
  merge_pdfs(@fragment_files, target_pdf)
end

# Merges +pdf_files+ (in order) into +target_pdf+ inside the download
# directory by running the external pdftk tool.
#
# The command is passed to Kernel#system as an argument list, so no shell is
# involved and paths containing spaces or shell metacharacters stay intact.
# Returns what Kernel#system returns for the pdftk call.
def merge_pdfs(pdf_files, target_pdf)
  puts "Merging single PDFs now"
  system("pdftk", *pdf_files, "output", File.join(@dl_dir, target_pdf))
end

# Deletes the fragment PDFs recorded by process_pdf_links.
#
# Only those files are removed. The earlier implementation deleted everything
# in the download directory, which also removed the merged PDF that had just
# been written and any unrelated file in that directory.
def cleanup
  FileUtils.rm_f(@fragment_files) if @fragment_files
  @fragment_files = nil
end
|
|
90
|
+
|
|
91
|
+
# Downloads a list of PDF URLs into a target directory.
class PDFDownloader

  # pdf_links:: URLs of the PDFs to fetch
  # dl_dir::    directory the downloaded files are written to
  def initialize(pdf_links, dl_dir)
    @dl_dir = dl_dir
    @pdf_links = pdf_links
  end

  # Downloads every link in order.
  #
  # Each file is stored in the download directory under the basename of its
  # link. Returns the local file paths, one per link, in link order.
  def download_all
    @pdf_links.map do |pdf_link|
      pdf_name = @dl_dir + '/' + File.basename(pdf_link)
      download_pdf(pdf_link, pdf_name)
      pdf_name
    end
  end

  private

  # Streams the body of +pdf_url+ into +filename+.
  #
  # * The file is opened in binary mode ('wb'); PDFs are binary data and text
  #   mode can corrupt them on platforms that translate line endings.
  # * request_uri is used instead of path so a query string is not dropped.
  # * TLS is enabled for https URLs.
  # * Net::HTTPResponse#value raises for a non-2xx response before the file is
  #   created, so an error page is never saved as a PDF.
  def download_pdf(pdf_url, filename)
    puts "Downloading PDF from #{pdf_url} to #{filename}"
    uri = URI.parse(pdf_url)
    Net::HTTP.start(uri.host, uri.port, :use_ssl => uri.scheme == 'https') do |http|
      http.request_get(uri.request_uri) do |res|
        res.value
        File.open(filename, 'wb') do |f|
          res.read_body do |seg|
            f << seg
            # Short pause after every chunk, kept from the original
            # implementation ("hack -- adjust to suit").
            sleep 0.005
          end
        end
      end
    end
  end

end
|
|
129
|
+
|
|
130
|
+
class ArchiveSite
|
|
131
|
+
|
|
132
|
+
attr_accessor :doc
|
|
133
|
+
|
|
134
|
+
# Fetches and parses the archive overview page of the site.
#
# The page is opened through the URI object (open-uri's OpenRead#open)
# instead of Kernel#open. Kernel#open no longer accepts URLs on Ruby 3.0+,
# and it would also treat a string starting with "|" as a command to run.
def initialize
  @base_url = "http://www.brandeins.de"
  @archive_url = @base_url + "/archiv.html"
  @doc = Nokogiri::HTML(URI.parse(@archive_url).open)
end
|
|
139
|
+
|
|
140
|
+
# Collects the magazine page URLs listed for +year+ on the archive page.
#
# year:: the year to look up (defaults to 2000)
#
# Looks at every "li" below ".jahrgang-<year> ul". List items that carry an
# id attribute are skipped, as are items without a link or whose first link
# has no href (previously a missing href raised a TypeError).
# Returns an array of absolute URLs in document order.
def magazine_links_by_year(year = 2000)
  puts "Loading Magazine from year #{year}"
  magazine_links = []
  @doc.css(".jahrgang-#{year} ul li").each do |node|
    next unless node['id'].nil?

    link = node.css('a').first
    href = link && link['href']
    next if href.nil?

    magazine_links << @base_url + '/' + href
  end
  magazine_links
end
|
|
156
|
+
|
|
157
|
+
# Returns the PDF links of the magazine issue found at +url+, as reported by
# ArchiveMagazine#get_magazine_pdf_links for that page.
def magazine_pdf_links(url)
  ArchiveMagazine.new(url).get_magazine_pdf_links
end
|
|
161
|
+
|
|
162
|
+
# One magazine issue page of the archive. Finds the article pages linked from
# the issue and resolves each of them to its PDF download link.
class ArchiveMagazine
  attr_accessor :url, :doc

  # Fetches and parses the issue page at +url+.
  #
  # The page is opened through the URI object (open-uri) instead of
  # Kernel#open, which no longer accepts URLs on Ruby 3.0+.
  def initialize(url)
    puts "Parsing #{url}"
    @url = url
    @base_url = "http://www.brandeins.de"
    @doc = Nokogiri::HTML(URI.parse(url).open)
  end

  # Returns the PDF URLs of all articles of this issue: the editorial links
  # first, followed by the links from the article list.
  def get_magazine_pdf_links
    get_editorial_article_links + get_schwerpunkt_article_links
  end

  # PDF URLs of the articles linked from "div.articleList ul h4 a".
  def get_schwerpunkt_article_links
    get_links("div.articleList ul h4 a")
  end

  # PDF URLs of the articles linked from ".editorial-links li a".
  def get_editorial_article_links
    get_links(".editorial-links li a")
  end

  # Resolves every link matched by +css_selector+ to the PDF of the article
  # it points to.
  #
  # Each matched link is loaded as a MagazineArticle. Articles without a PDF
  # download link are reported on stdout and left out of the result.
  # Returns an array of absolute PDF URLs in document order.
  def get_links(css_selector)
    pdf_links = []
    @doc.css(css_selector).each do |node|
      article_link = @base_url + '/' + node['href']
      pdf_link = MagazineArticle.new(article_link).get_pdf_link
      if pdf_link.nil?
        puts "------------------------------"
        puts "No Content for: #{article_link}"
        puts "------------------------------"
      else
        pdf_links << @base_url + '/' + pdf_link
      end
    end
    pdf_links
  end

  # A single article page; knows how to find its PDF download link.
  class MagazineArticle
    attr_accessor :url, :doc

    # Fetches and parses the article page at +url+ (opened via open-uri's
    # URI#open rather than Kernel#open, see ArchiveMagazine#initialize).
    def initialize(url)
      puts "Parsing Article: #{url}"
      @url = url
      @doc = Nokogiri::HTML(URI.parse(url).open)
    end

    # Returns the href of the first link matching
    # "div#sidebar ul li#downloaden a", or nil when the page has none.
    def get_pdf_link
      link = @doc.css("div#sidebar ul li#downloaden a").first
      link && link['href']
    end

  end

end
|
|
224
|
+
|
|
225
|
+
end
|
|
226
|
+
|
|
227
|
+
end
|
|
228
|
+
end
|
metadata
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: brandeins-dl
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
prerelease:
|
|
5
|
+
version: 0.0.1
|
|
6
|
+
platform: ruby
|
|
7
|
+
authors:
|
|
8
|
+
- Gregory Igelmund
|
|
9
|
+
autorequire:
|
|
10
|
+
bindir: bin
|
|
11
|
+
cert_chain: []
|
|
12
|
+
|
|
13
|
+
date: 2011-11-04 00:00:00 Z
|
|
14
|
+
dependencies:
|
|
15
|
+
- !ruby/object:Gem::Dependency
|
|
16
|
+
name: rake
|
|
17
|
+
prerelease: false
|
|
18
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
|
19
|
+
none: false
|
|
20
|
+
requirements:
|
|
21
|
+
- - ">="
|
|
22
|
+
- !ruby/object:Gem::Version
|
|
23
|
+
version: "0"
|
|
24
|
+
type: :runtime
|
|
25
|
+
version_requirements: *id001
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: thor
|
|
28
|
+
prerelease: false
|
|
29
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
|
30
|
+
none: false
|
|
31
|
+
requirements:
|
|
32
|
+
- - ">="
|
|
33
|
+
- !ruby/object:Gem::Version
|
|
34
|
+
version: "0"
|
|
35
|
+
type: :runtime
|
|
36
|
+
version_requirements: *id002
|
|
37
|
+
- !ruby/object:Gem::Dependency
|
|
38
|
+
name: nokogiri
|
|
39
|
+
prerelease: false
|
|
40
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
|
41
|
+
none: false
|
|
42
|
+
requirements:
|
|
43
|
+
- - ">="
|
|
44
|
+
- !ruby/object:Gem::Version
|
|
45
|
+
version: "0"
|
|
46
|
+
type: :runtime
|
|
47
|
+
version_requirements: *id003
|
|
48
|
+
description: "BrandEins Downloader offers two commands: 'brandeins download YEAR' and 'brandeins download YEAR --volume=NUMBER'"
|
|
49
|
+
email:
|
|
50
|
+
- gregory.igelmund@gmail.com
|
|
51
|
+
executables:
|
|
52
|
+
- brandeins
|
|
53
|
+
extensions: []
|
|
54
|
+
|
|
55
|
+
extra_rdoc_files: []
|
|
56
|
+
|
|
57
|
+
files:
|
|
58
|
+
- .gitignore
|
|
59
|
+
- Gemfile
|
|
60
|
+
- README.md
|
|
61
|
+
- Rakefile
|
|
62
|
+
- bin/brandeins
|
|
63
|
+
- brandeins-dl.gemspec
|
|
64
|
+
- lib/brandeins-dl.rb
|
|
65
|
+
- lib/brandeins-dl/version.rb
|
|
66
|
+
homepage: http://www.grekko.de
|
|
67
|
+
licenses: []
|
|
68
|
+
|
|
69
|
+
post_install_message:
|
|
70
|
+
rdoc_options: []
|
|
71
|
+
|
|
72
|
+
require_paths:
|
|
73
|
+
- lib
|
|
74
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
75
|
+
none: false
|
|
76
|
+
requirements:
|
|
77
|
+
- - ">="
|
|
78
|
+
- !ruby/object:Gem::Version
|
|
79
|
+
version: "0"
|
|
80
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
81
|
+
none: false
|
|
82
|
+
requirements:
|
|
83
|
+
- - ">="
|
|
84
|
+
- !ruby/object:Gem::Version
|
|
85
|
+
version: "0"
|
|
86
|
+
requirements: []
|
|
87
|
+
|
|
88
|
+
rubyforge_project:
|
|
89
|
+
rubygems_version: 1.8.10
|
|
90
|
+
signing_key:
|
|
91
|
+
specification_version: 3
|
|
92
|
+
summary: BrandEins Downloader allows you to download past volumes of the Brand Eins magazine
|
|
93
|
+
test_files: []
|
|
94
|
+
|