brandeins-dl 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
# Fetch gems over TLS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# All dependencies are declared in brandeins-dl.gemspec.
gemspec
data/README.md ADDED
@@ -0,0 +1,26 @@
1
+ # About BrandEins Downloader
2
+
3
+ BrandEins Downloader is a command line tool to download former volumes
4
+ of the German economics magazine "Brand Eins". The articles of former volumes
5
+ are available through their website, and BrandEins Downloader takes all
6
+ these fragmented PDFs, downloads and merges them into a single PDF.
7
+
8
+
9
+ ## Requirements
10
+ BrandEins Downloader uses *pdftk* and depends on *ruby*, *rubygems*, and
11
+ several ruby libraries (that you can get through rubygems)
12
+
13
+
14
+ ## Install
15
+ `gem install brandeins-dl`
16
+
17
+
18
+ ## Usage
19
+
20
+ Download just one magazine
21
+
22
+ `brandeins download --path=/Path/where/to/download/the/files --year=2011 --volume=5`
23
+
24
+ Download the whole collection of a certain year
25
+
26
+ `brandeins download_all --path=/Path/where/to/download/the/files --year=2011`
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/bin/brandeins ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'brandeins-dl'
4
+ BrandEins::CLI.start
@@ -0,0 +1,23 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "brandeins-dl/version"
4
+
5
# Gem specification of brandeins-dl. The file lists are taken from git, so the
# gem has to be built from inside a git checkout.
Gem::Specification.new do |s|
  s.name        = "brandeins-dl"
  s.version     = BrandEins::VERSION
  s.authors     = ["Gregory Igelmund"]
  s.email       = ["gregory.igelmund@gmail.com"]
  s.homepage    = "http://www.grekko.de"
  s.summary     = %q{BrandEins Downloader allows you to download past volumes of the Brand Eins magazine}
  # Describe the commands the CLI actually provides (see BrandEins::CLI).
  s.description = %q{BrandEins Downloader offers two commands: 'brandeins download --path=PATH --year=YEAR --volume=NUMBER' and 'brandeins download_all --path=PATH --year=YEAR'}

  #s.rubyforge_project = "brandeins-dl"

  # rake is only needed for the gem tasks in the Rakefile, not at runtime.
  s.add_development_dependency "rake"
  s.add_dependency "thor"
  s.add_dependency "nokogiri"

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]
end
@@ -0,0 +1,3 @@
1
module BrandEins
  # Version of the brandeins-dl gem; frozen so it cannot be mutated at runtime.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,228 @@
1
require "brandeins-dl/version"
require 'fileutils'
require 'net/http'
require 'open-uri'
require 'uri'
require 'nokogiri'
require 'thor'
7
+
8
+ module BrandEins
9
+
10
# Thor based command line interface behind the `brandeins` executable.
#
# Both commands create a BrandEins::Downloader for the directory given via
# --path and delegate the actual work to it.
class CLI < Thor
  desc "download_all", "Download all magazines of the defined year"
  method_option :year, :type => :numeric, :required => true
  method_option :path, :type => :string, :required => true
  # Downloads and merges every volume of the year passed via --year.
  def download_all
    downloader = BrandEins::Downloader.new(options.path)
    downloader.get_magazines_of_year(options.year)
  end

  # The help text used to be a copy of download_all's although this command
  # fetches exactly one volume.
  desc "download", "Download a single magazine volume of the defined year"
  method_option :path, :type => :string, :required => true
  method_option :volume, :type => :numeric, :required => true
  method_option :year, :type => :numeric, :required => true
  # Downloads and merges the volume selected via --year and --volume.
  def download
    downloader = BrandEins::Downloader.new(options.path)
    downloader.get_magazine(options.year, options.volume)
  end
end
28
+
29
# Downloads the article PDFs of "brand eins" magazine volumes and merges them
# into one PDF per volume inside the download directory.
#
# Merging is delegated to the external `pdftk` command line tool.
class Downloader
  attr_reader :archive

  # Fetches +url+ and parses the response body as HTML.
  #
  # Uses the URI object's own #open (provided by open-uri) instead of
  # Kernel#open, which no longer opens URLs on Ruby 3.0+. The block form
  # closes the underlying IO once parsing is done.
  #
  # @param url [String] absolute URL of the page
  # @return [Nokogiri::HTML::Document]
  def self.fetch_document(url)
    URI.parse(url).open { |io| Nokogiri::HTML(io) }
  end

  # @param path [String] directory that receives the downloaded fragments and
  #   the merged PDFs; it is created when it does not exist yet
  def initialize(path)
    @url = "http://www.brandeins.de"
    @archive = ArchiveSite.new
    @dl_dir = path
    @fragment_files = []

    check_download_path
  end

  # Creates the download directory, including missing parent directories.
  def check_download_path
    FileUtils.mkdir_p(@dl_dir) unless File.directory?(@dl_dir)
  end

  # Downloads and merges every volume listed in the archive for +year+.
  #
  # Volumes are numbered starting at 1 in archive order, which matches the
  # numbering #get_magazine uses.
  #
  # @param year [Integer]
  def get_magazines_of_year(year = 2000)
    puts "Getting all brand eins magazines of a #{year}. This could take a while..."
    magazine_links_per_year = @archive.magazine_links_by_year(year)
    magazine_links_per_year.each_with_index do |magazine_link, index|
      volume = index + 1
      puts "Parsing Volume #{volume} of #{year}"
      get_magazine_by_link(magazine_link, get_target_pdf(year, volume))
    end
  end

  # Downloads and merges a single volume.
  #
  # @param year [Integer]
  # @param volume [Integer] 1-based position of the volume within the year
  # @raise [ArgumentError] when the archive lists no such volume
  def get_magazine(year = 2000, volume = 1)
    puts "Parsing Volume #{volume} of #{year}"
    target_pdf = get_target_pdf(year, volume)

    magazine_links = @archive.magazine_links_by_year(year)
    # A volume below 1 would turn into a negative index and silently select a
    # volume counted from the end of the list.
    target_magazine_link = volume >= 1 ? magazine_links[volume - 1] : nil
    if target_magazine_link.nil?
      raise ArgumentError, "Volume #{volume} of #{year} was not found in the archive"
    end

    get_magazine_by_link(target_magazine_link, target_pdf)
  end

  # Downloads all article PDFs linked from the magazine page, merges them
  # into +target_pdf+ and removes the downloaded fragments afterwards.
  #
  # @param target_magazine_link [String] URL of the magazine overview page
  # @param target_pdf [String] file name of the merged PDF
  def get_magazine_by_link(target_magazine_link, target_pdf)
    pdf_links = @archive.magazine_pdf_links(target_magazine_link)
    process_pdf_links(pdf_links, target_pdf)
    cleanup
  end

  # @return [String] file name of the merged PDF for the given volume
  def get_target_pdf(year, volume)
    "Brand-Eins-#{year}-#{volume}.pdf"
  end

  # Downloads every PDF in +pdf_links+ and merges the results.
  #
  # The downloaded paths are remembered so that #cleanup can remove exactly
  # these files later on.
  #
  # @param pdf_links [Array<String>] absolute URLs of the article PDFs
  # @param target_pdf [String] file name of the merged PDF
  # @return [Boolean, nil] result of #merge_pdfs
  def process_pdf_links(pdf_links, target_pdf)
    pdf_downloader = PDFDownloader.new(pdf_links, @dl_dir)
    pdf_files = pdf_downloader.download_all
    @fragment_files = (@fragment_files || []) + pdf_files
    merge_pdfs(pdf_files, target_pdf)
  end

  # Merges +pdf_files+ (in order) into +target_pdf+ inside the download
  # directory using pdftk.
  #
  # @param pdf_files [Array<String>] paths of the PDFs to merge
  # @param target_pdf [String] file name of the merged PDF
  # @return [Boolean, nil] true on success, false when there was nothing to
  #   merge or pdftk failed, nil when pdftk could not be executed
  def merge_pdfs(pdf_files, target_pdf)
    if pdf_files.empty?
      puts "No PDFs to merge for #{target_pdf}"
      return false
    end

    puts "Merging single PDFs now"
    # The multi-argument form of system bypasses the shell, so paths that
    # contain spaces or shell metacharacters are passed through unchanged.
    merged = system("pdftk", *pdf_files, "output", File.join(@dl_dir, target_pdf))
    warn "pdftk could not create #{target_pdf}" unless merged
    merged
  end

  # Removes the PDF fragments downloaded by #process_pdf_links.
  #
  # Only files this object downloaded are deleted; merged PDFs and any other
  # content of the download directory are left alone.
  def cleanup
    FileUtils.rm_f((@fragment_files || []).uniq)
    @fragment_files = []
  end

  # Downloads a list of PDF URLs into a directory.
  class PDFDownloader

    # @param pdf_links [Array<String>] absolute URLs of the PDFs
    # @param dl_dir [String] existing directory the PDFs are written to
    def initialize(pdf_links, dl_dir)
      @dl_dir = dl_dir
      @pdf_links = pdf_links
    end

    # Downloads every PDF; each file is named after the basename of its URL.
    #
    # @return [Array<String>] paths of the downloaded files, in link order
    def download_all
      @pdf_links.map do |pdf_link|
        pdf_name = File.join(@dl_dir, File.basename(pdf_link))
        download_pdf(pdf_link, pdf_name)
        pdf_name
      end
    end

    private

    # Streams the body of +pdf_url+ into +filename+.
    def download_pdf(pdf_url, filename)
      puts "Downloading PDF from #{pdf_url} to #{filename}"
      # Parse first so that an invalid URL does not leave an empty file behind.
      uri = URI.parse(pdf_url)
      # Binary mode keeps the PDF bytes intact on platforms that translate
      # line endings in text mode.
      File.open(filename, 'wb') do |f|
        Net::HTTP.start(uri.host, uri.port, :use_ssl => uri.scheme == 'https') do |http|
          # request_uri keeps the query string, which uri.path would drop.
          http.request_get(uri.request_uri) do |res|
            res.read_body do |seg|
              f << seg
              # Short pause between chunks kept from the original code, where
              # it was marked as a hack to "adjust to suit".
              sleep 0.005
            end
          end
        end
      end
    end

  end

  # Access to the archive overview page of brandeins.de.
  class ArchiveSite

    attr_accessor :doc

    # Loads and parses the archive overview page.
    def initialize
      @base_url = "http://www.brandeins.de"
      @archive_url = @base_url + "/archiv.html"
      @doc = Downloader.fetch_document(@archive_url)
    end

    # Collects the links to the magazine pages of +year+.
    #
    # List items that carry an id attribute or contain no link are skipped.
    #
    # @param year [Integer]
    # @return [Array<String>] absolute URLs in document order
    def magazine_links_by_year(year = 2000)
      puts "Loading Magazine from year #{year}"
      magazine_links = []
      @doc.css(".jahrgang-#{year} ul li").each do |node|
        next unless node['id'].nil?

        anchor = node.css('a').first
        next if anchor.nil?

        magazine_links << @base_url + '/' + anchor['href']
      end
      magazine_links
    end

    # @param url [String] URL of a magazine overview page
    # @return [Array<String>] absolute URLs of the article PDFs of that magazine
    def magazine_pdf_links(url)
      magazine = ArchiveMagazine.new(url)
      magazine.get_magazine_pdf_links
    end

    # A single magazine overview page.
    class ArchiveMagazine
      attr_accessor :url, :doc

      # Loads and parses the magazine page at +url+.
      def initialize(url)
        puts "Parsing #{url}"
        @url = url
        @base_url = "http://www.brandeins.de"
        @doc = Downloader.fetch_document(url)
      end

      # @return [Array<String>] PDF URLs of the editorial articles followed by
      #   those of the "Schwerpunkt" articles
      def get_magazine_pdf_links
        [get_editorial_article_links, get_schwerpunkt_article_links].flatten
      end

      def get_schwerpunkt_article_links
        get_links("div.articleList ul h4 a")
      end

      def get_editorial_article_links
        get_links(".editorial-links li a")
      end

      # Visits every article matched by +css_selector+ and collects the PDF
      # download link of each one. Articles without a download link are
      # reported on stdout and skipped.
      #
      # @param css_selector [String] selector matching the article anchors
      # @return [Array<String>] absolute PDF URLs
      def get_links(css_selector)
        pdf_links = []
        @doc.css(css_selector).each do |node|
          article_link = @base_url + '/' + node['href']
          pdf_link = MagazineArticle.new(article_link).get_pdf_link
          if pdf_link.nil?
            puts "------------------------------"
            puts "No Content for: #{article_link}"
            puts "------------------------------"
          else
            pdf_links << @base_url + '/' + pdf_link
          end
        end
        pdf_links
      end

      # A single article page.
      class MagazineArticle
        attr_accessor :url, :doc

        # Loads and parses the article page at +url+.
        def initialize(url)
          puts "Parsing Article: #{url}"
          @url = url
          @doc = Downloader.fetch_document(url)
        end

        # @return [String, nil] href of the download link in the sidebar, or
        #   nil when the article has none
        def get_pdf_link
          link = @doc.css("div#sidebar ul li#downloaden a").first
          link && link['href']
        end

      end

    end

  end

end
228
+ end
metadata ADDED
@@ -0,0 +1,94 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: brandeins-dl
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.1
6
+ platform: ruby
7
+ authors:
8
+ - Gregory Igelmund
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-11-04 00:00:00 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: rake
17
+ prerelease: false
18
+ requirement: &id001 !ruby/object:Gem::Requirement
19
+ none: false
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ type: :runtime
25
+ version_requirements: *id001
26
+ - !ruby/object:Gem::Dependency
27
+ name: thor
28
+ prerelease: false
29
+ requirement: &id002 !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: "0"
35
+ type: :runtime
36
+ version_requirements: *id002
37
+ - !ruby/object:Gem::Dependency
38
+ name: nokogiri
39
+ prerelease: false
40
+ requirement: &id003 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: "0"
46
+ type: :runtime
47
+ version_requirements: *id003
48
+ description: "BrandEins Downloader offers two commands: 'brandeins download YEAR' and 'brandeins download YEAR --volume=NUMBER'"
49
+ email:
50
+ - gregory.igelmund@gmail.com
51
+ executables:
52
+ - brandeins
53
+ extensions: []
54
+
55
+ extra_rdoc_files: []
56
+
57
+ files:
58
+ - .gitignore
59
+ - Gemfile
60
+ - README.md
61
+ - Rakefile
62
+ - bin/brandeins
63
+ - brandeins-dl.gemspec
64
+ - lib/brandeins-dl.rb
65
+ - lib/brandeins-dl/version.rb
66
+ homepage: http://www.grekko.de
67
+ licenses: []
68
+
69
+ post_install_message:
70
+ rdoc_options: []
71
+
72
+ require_paths:
73
+ - lib
74
+ required_ruby_version: !ruby/object:Gem::Requirement
75
+ none: false
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: "0"
80
+ required_rubygems_version: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: "0"
86
+ requirements: []
87
+
88
+ rubyforge_project:
89
+ rubygems_version: 1.8.10
90
+ signing_key:
91
+ specification_version: 3
92
+ summary: BrandEins Downloader allows you to download past volumes of the Brand Eins magazine
93
+ test_files: []
94
+