brandeins-dl 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
# Fetch gems over HTTPS so packages cannot be tampered with in transit.
source "https://rubygems.org"

# Dependencies are taken from the gemspec that sits next to this Gemfile.
gemspec
data/README.md ADDED
@@ -0,0 +1,26 @@
1
+ # About BrandEins Downloader
2
+
3
+ BrandEins Downloader is a command line tool to download former volumes
4
+ of the German economic magazine "Brand Eins". The articles of former
5
+ volumes are available through their website, and BrandEins Downloader takes all
6
+ these fragmented PDFs, downloads and merges them into a single pdf.
7
+
8
+
9
+ ## Requirements
10
+ BrandEins Downloader uses *pdftk* and depends on *ruby*, *rubygems*, and
11
+ several ruby libraries (that you can get through rubygems)
12
+
13
+
14
+ ## Install
15
+ `gem install brandeins-dl`
16
+
17
+
18
+ ## Usage
19
+
20
+ Download just one magazine
21
+
22
+ `brandeins download --path=/Path/where/to/download/the/files --year=2011 --volume=5`
23
+
24
+ Download the whole collection of a certain year
25
+
26
+ `brandeins download_all --path=/Path/where/to/download/the/files --year=2011`
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/bin/brandeins ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'brandeins-dl'
4
+ BrandEins::CLI.start
@@ -0,0 +1,23 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "brandeins-dl/version"
4
+
5
# Gem specification for brandeins-dl. The version is read from
# BrandEins::VERSION; the packaged file lists are taken from `git ls-files`.
Gem::Specification.new do |s|
  s.name        = "brandeins-dl"
  s.version     = BrandEins::VERSION
  s.authors     = ["Gregory Igelmund"]
  s.email       = ["gregory.igelmund@gmail.com"]
  s.homepage    = "http://www.grekko.de"
  s.summary     = %q{BrandEins Downloader allows you to download past volumes of the Brand Eins magazine}
  # The description lists the commands exactly as the CLI accepts them
  # (all of --path, --year and, for `download`, --volume are required options).
  s.description = %q{BrandEins Downloader offers two commands: 'brandeins download --path=PATH --year=YEAR --volume=NUMBER' and 'brandeins download_all --path=PATH --year=YEAR'}

  #s.rubyforge_project = "brandeins-dl"
  s.add_dependency "rake"
  s.add_dependency "thor"
  s.add_dependency "nokogiri"

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
end
@@ -0,0 +1,3 @@
1
# Top-level namespace of the BrandEins Downloader gem.
module BrandEins
  # Version string of this release; the gemspec reads it as the gem version.
  VERSION = '0.0.1'
end
@@ -0,0 +1,228 @@
1
+ require "brandeins-dl/version"
2
+ require 'nokogiri'
3
+ require 'open-uri'
4
+ require 'uri'
5
+ require 'fileutils'
6
+ require 'thor'
7
+
8
+ module BrandEins
9
+
10
# Thor command line interface with two commands: `download_all` and
# `download`. Both hand the work over to BrandEins::Downloader.
class CLI < Thor
  desc "download_all", "Download all magazines of the defined year"
  method_option :year, :type => :numeric, :required => true
  method_option :path, :type => :string, :required => true
  # Passes --path to a new Downloader and asks it for all magazines of --year.
  def download_all
    downloader = BrandEins::Downloader.new(options.path)
    downloader.get_magazines_of_year(options.year)
  end

  # The help text used to be a copy of the download_all description.
  desc "download", "Download a single magazine volume of the defined year"
  method_option :path, :type => :string, :required => true
  method_option :volume, :type => :numeric, :required => true
  method_option :year, :type => :numeric, :required => true
  # Passes --path to a new Downloader and asks it for --volume of --year.
  def download
    downloader = BrandEins::Downloader.new(options.path)
    downloader.get_magazine(options.year, options.volume)
  end
end
28
+
29
+ class Downloader
30
+ attr_reader :archive
31
+
32
# Prepares a downloader that works inside the given directory.
#
# path - download directory; check_download_path creates it when missing.
#
# Building the ArchiveSite fetches and parses the archive page right away.
def initialize(path)
  @dl_dir  = path
  @url     = "http://www.brandeins.de"
  @archive = ArchiveSite.new

  check_download_path
end
39
+
40
# Makes sure the download directory exists.
#
# FileUtils.mkdir_p creates missing parent directories as well and accepts
# a directory that already exists. The previous File.exists? / Dir.mkdir
# combination fails on Ruby 3.2+ (File.exists? was removed) and on nested
# paths whose parent does not exist yet.
def check_download_path
  FileUtils.mkdir_p(@dl_dir)
end
43
+
44
# Downloads and merges every magazine the archive lists for a year.
#
# year - the year whose magazines are fetched (default: 2000).
#
# Volumes are counted from 1 so that the progress output and the target file
# names agree with get_magazine, which maps volume N to the N-th archive link.
# (Counting from 0 produced "Volume 0" and files ending in "-0.pdf".)
def get_magazines_of_year(year = 2000)
  puts "Getting all brand eins magazines of #{year}. This could take a while..."
  @archive.magazine_links_by_year(year).each.with_index(1) do |magazine_link, volume|
    puts "Parsing Volume #{volume} of #{year}"
    get_magazine_by_link(magazine_link, get_target_pdf(year, volume))
  end
end
53
+
54
# Downloads and merges a single magazine volume.
#
# year   - the year of the magazine (default: 2000).
# volume - 1-based position of the magazine in that year's archive list
#          (default: 1).
#
# Raises ArgumentError when the archive has no such volume. Without this
# check volume 0 silently selected the last magazine (index -1), and a too
# large volume passed nil on to get_magazine_by_link.
def get_magazine(year = 2000, volume = 1)
  puts "Parsing Volume #{volume} of #{year}"

  magazine_links = @archive.magazine_links_by_year(year)
  index = volume.to_i - 1
  unless index.between?(0, magazine_links.size - 1)
    raise ArgumentError,
          "Volume #{volume} of #{year} not found (#{magazine_links.size} volumes available)"
  end

  get_magazine_by_link(magazine_links[index], get_target_pdf(year, volume))
end
63
+
64
# Fetches the PDF links of one magazine page, downloads and merges them into
# target_pdf, then runs cleanup.
#
# target_magazine_link - URL of the magazine's archive page.
# target_pdf           - file name of the merged PDF.
def get_magazine_by_link(target_magazine_link, target_pdf)
  process_pdf_links(@archive.magazine_pdf_links(target_magazine_link), target_pdf)
  cleanup
end
69
+
70
+
71
# Builds the file name of the merged magazine PDF,
# e.g. get_target_pdf(2011, 5) gives "Brand-Eins-2011-5.pdf".
def get_target_pdf(year, volume)
  ["Brand-Eins", year, volume].join("-") + ".pdf"
end
74
+
75
# Downloads every linked PDF into the download directory and merges the
# downloaded files into target_pdf.
#
# pdf_links  - URLs of the single PDFs.
# target_pdf - file name of the merged PDF.
#
# Returns whatever merge_pdfs returns.
def process_pdf_links(pdf_links, target_pdf)
  downloaded_files = PDFDownloader.new(pdf_links, @dl_dir).download_all
  merge_pdfs(downloaded_files, target_pdf)
end
80
+
81
# Merges the given PDF files into one file inside the download directory by
# running the external `pdftk` tool.
#
# pdf_files  - paths of the single PDFs, in the order they are merged.
# target_pdf - file name of the merged PDF.
#
# Returns the result of Kernel#system, or false when there is nothing to
# merge.
def merge_pdfs(pdf_files, target_pdf)
  if pdf_files.empty?
    puts "No PDFs to merge for #{target_pdf}"
    return false
  end

  puts "Merging single PDFs now"
  # Passing the arguments one by one bypasses the shell, so paths containing
  # spaces or shell metacharacters reach pdftk unchanged.
  system("pdftk", *pdf_files, "output", File.join(@dl_dir, target_pdf))
end
86
+
87
# Removes the downloaded single PDFs from the download directory.
#
# Merged magazines are written into the same directory under the name
# "Brand-Eins-<year>-<volume>.pdf", so entries with such a name are kept;
# deleting everything also removed the magazine that had just been merged.
#
# NOTE(review): every other entry of the download directory is still removed,
# so the directory should not hold unrelated files.
def cleanup
  leftovers = Dir.glob(File.join(@dl_dir, "*")).reject do |path|
    File.basename(path) =~ /\ABrand-Eins-[^-]+-[^-]+\.pdf\z/
  end
  FileUtils.rm_r(leftovers)
end
90
+
91
# Net::HTTP is used directly below, so it is required explicitly instead of
# relying on another library having loaded it.
require 'net/http'

# Downloads a list of PDF URLs into one directory.
class PDFDownloader
  # Pause in seconds after each received body segment. The original code
  # marked this sleep as a hack to be adjusted; presumably it throttles the
  # transfer.
  SEGMENT_PAUSE = 0.005

  # pdf_links - URLs of the PDFs to download.
  # dl_dir    - directory the files are written to.
  def initialize(pdf_links, dl_dir)
    @dl_dir = dl_dir
    @pdf_links = pdf_links
  end

  # Downloads every link; each file is named after the last path segment of
  # its URL.
  #
  # Returns the local file paths in the order of the links.
  def download_all
    @pdf_links.map do |pdf_link|
      pdf_name = @dl_dir + '/' + File.basename(pdf_link)
      download_pdf(pdf_link, pdf_name)
      pdf_name
    end
  end

  private

  # Streams the body of pdf_url into filename.
  def download_pdf(pdf_url, filename)
    puts "Downloading PDF from #{pdf_url} to #{filename}"
    uri = URI.parse(pdf_url)
    # 'wb': PDFs are binary, text mode would translate line endings on Windows.
    File.open(filename, 'wb') do |f|
      Net::HTTP.start(uri.host, uri.port, :use_ssl => uri.scheme == 'https') do |http|
        # request_uri keeps the query string, uri.path alone would drop it.
        http.request_get(uri.request_uri) do |res|
          res.read_body do |seg|
            f << seg
            sleep SEGMENT_PAUSE
          end
        end
      end
    end
  end

end
129
+
130
+ class ArchiveSite
131
+
132
+ attr_accessor :doc
133
+
134
# Fetches the archive overview page and parses it with Nokogiri.
#
# URI.open (provided by open-uri) replaces the bare `open` call: Kernel#open
# no longer accepts URLs on Ruby 3.0+. URI.open requires Ruby 2.5 or newer.
def initialize
  @base_url = "http://www.brandeins.de"
  @archive_url = @base_url + "/archiv.html"
  @doc = Nokogiri::HTML(URI.open(@archive_url))
end
139
+
140
# Collects the magazine page URLs the archive page lists for a year.
#
# year - the year to look up (default: 2000).
#
# Looks at every `li` below ".jahrgang-<year> ul". Items that carry an id
# attribute and items without a link are skipped; for the rest the href of
# the first link is appended to the base URL.
#
# Returns an Array of URL strings in document order.
def magazine_links_by_year(year = 2000)
  puts "Loading Magazine from year #{year}"
  @doc.css(".jahrgang-#{year} ul li").each_with_object([]) do |entry, collected|
    next unless entry['id'].nil?

    anchor = entry.css('a')[0]
    next if anchor.nil?

    collected << @base_url + '/' + anchor['href']
  end
end
156
+
157
# Returns the PDF links of the magazine page at url, as reported by
# ArchiveMagazine#get_magazine_pdf_links.
def magazine_pdf_links(url)
  ArchiveMagazine.new(url).get_magazine_pdf_links
end
161
+
162
# One magazine page of the archive; finds the PDF download links of the
# articles linked from that page.
class ArchiveMagazine
  attr_accessor :url, :doc

  # Fetches and parses the magazine page.
  #
  # URI.open (open-uri) is used because Kernel#open no longer accepts URLs
  # on Ruby 3.0+; it requires Ruby 2.5 or newer.
  def initialize(url)
    puts "Parsing #{url}"
    @url = url
    @base_url = "http://www.brandeins.de"
    @doc = Nokogiri::HTML(URI.open(url))
  end

  # Returns the PDF URLs of the editorial articles followed by those of the
  # "Schwerpunkt" articles.
  def get_magazine_pdf_links
    get_editorial_article_links + get_schwerpunkt_article_links
  end

  # PDF URLs of the articles linked from the article list.
  def get_schwerpunkt_article_links
    get_links("div.articleList ul h4 a")
  end

  # PDF URLs of the articles linked from the editorial links.
  def get_editorial_article_links
    get_links(".editorial-links li a")
  end

  # Loads the article page behind every link matched by css_selector and
  # collects the absolute URL of its PDF download.
  #
  # Articles without a download link are reported on stdout and skipped.
  #
  # Returns an Array of URL strings.
  def get_links(css_selector)
    pdf_links = []
    @doc.css(css_selector).each do |node|
      article_link = @base_url + '/' + node['href']
      pdf_link = MagazineArticle.new(article_link).get_pdf_link
      if pdf_link.nil?
        puts "------------------------------"
        puts "No Content for: #{article_link}"
        puts "------------------------------"
      else
        pdf_links << @base_url + '/' + pdf_link
      end
    end
    pdf_links
  end

  # A single article page.
  class MagazineArticle
    attr_accessor :url, :doc

    # Fetches and parses the article page (see the note on URI.open above).
    def initialize(url)
      puts "Parsing Article: #{url}"
      @url = url
      @doc = Nokogiri::HTML(URI.open(url))
    end

    # Returns the href of the first download link in the sidebar, or nil
    # when the article has none.
    def get_pdf_link
      link = @doc.css("div#sidebar ul li#downloaden a")[0]
      link.nil? ? nil : link['href']
    end

  end

end
224
+
225
+ end
226
+
227
+ end
228
+ end
metadata ADDED
@@ -0,0 +1,94 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: brandeins-dl
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.1
6
+ platform: ruby
7
+ authors:
8
+ - Gregory Igelmund
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-11-04 00:00:00 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: rake
17
+ prerelease: false
18
+ requirement: &id001 !ruby/object:Gem::Requirement
19
+ none: false
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ type: :runtime
25
+ version_requirements: *id001
26
+ - !ruby/object:Gem::Dependency
27
+ name: thor
28
+ prerelease: false
29
+ requirement: &id002 !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ">="
33
+ - !ruby/object:Gem::Version
34
+ version: "0"
35
+ type: :runtime
36
+ version_requirements: *id002
37
+ - !ruby/object:Gem::Dependency
38
+ name: nokogiri
39
+ prerelease: false
40
+ requirement: &id003 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: "0"
46
+ type: :runtime
47
+ version_requirements: *id003
48
+ description: "BrandEins Downloader offers two commands: 'brandeins download YEAR' and 'brandeins download YEAR --volume=NUMBER'"
49
+ email:
50
+ - gregory.igelmund@gmail.com
51
+ executables:
52
+ - brandeins
53
+ extensions: []
54
+
55
+ extra_rdoc_files: []
56
+
57
+ files:
58
+ - .gitignore
59
+ - Gemfile
60
+ - README.md
61
+ - Rakefile
62
+ - bin/brandeins
63
+ - brandeins-dl.gemspec
64
+ - lib/brandeins-dl.rb
65
+ - lib/brandeins-dl/version.rb
66
+ homepage: http://www.grekko.de
67
+ licenses: []
68
+
69
+ post_install_message:
70
+ rdoc_options: []
71
+
72
+ require_paths:
73
+ - lib
74
+ required_ruby_version: !ruby/object:Gem::Requirement
75
+ none: false
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: "0"
80
+ required_rubygems_version: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: "0"
86
+ requirements: []
87
+
88
+ rubyforge_project:
89
+ rubygems_version: 1.8.10
90
+ signing_key:
91
+ specification_version: 3
92
+ summary: BrandEins Downloader allows you to download past volumes of the Brand Eins magazine
93
+ test_files: []
94
+