brandeins 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/.rvmrc ADDED
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # This is an RVM Project .rvmrc file, used to automatically load the ruby
4
+ # development environment upon cd'ing into the directory
5
+
6
+ # First we specify our desired <ruby>[@<gemset>], the @gemset name is optional,
7
+ # Only full ruby name is supported here, for short names use:
8
+ # echo "rvm use 1.9.3" > .rvmrc
9
+ environment_id="ruby-1.9.3-p286@brandeins"
10
+
11
+ # Uncomment the following lines if you want to verify rvm version per project
12
+ # rvmrc_rvm_version="1.16.19 (stable)" # 1.10.1 seems as a safe start
13
+ # eval "$(echo ${rvm_version}.${rvmrc_rvm_version} | awk -F. '{print "[[ "$1*65536+$2*256+$3" -ge "$4*65536+$5*256+$6" ]]"}' )" || {
14
+ # echo "This .rvmrc file requires at least RVM ${rvmrc_rvm_version}, aborting loading."
15
+ # return 1
16
+ # }
17
+
18
+ # First we attempt to load the desired environment directly from the environment
19
+ # file. This is very fast and efficient compared to running through the entire
20
+ # CLI and selector. If you want feedback on which environment was used then
21
+ # insert the word 'use' after --create as this triggers verbose mode.
22
+ if [[ -d "${rvm_path:-$HOME/.rvm}/environments"
23
+ && -s "${rvm_path:-$HOME/.rvm}/environments/$environment_id" ]]
24
+ then
25
+ \. "${rvm_path:-$HOME/.rvm}/environments/$environment_id"
26
+ [[ -s "${rvm_path:-$HOME/.rvm}/hooks/after_use" ]] &&
27
+ \. "${rvm_path:-$HOME/.rvm}/hooks/after_use" || true
28
+ else
29
+ # If the environment file has not yet been created, use the RVM CLI to select.
30
+ rvm --create "$environment_id" || {
31
+ echo "Failed to create RVM environment '${environment_id}'."
32
+ return 1
33
+ }
34
+ fi
35
+
36
+ # If you use bundler, this might be useful to you:
37
+ # if [[ -s Gemfile ]] && {
38
+ # ! builtin command -v bundle >/dev/null ||
39
+ # builtin command -v bundle | GREP_OPTIONS= \grep $rvm_path/bin/bundle >/dev/null
40
+ # }
41
+ # then
42
+ # printf "%b" "The rubygem 'bundler' is not installed. Installing it now.\n"
43
+ # gem install bundler
44
+ # fi
45
+ # if [[ -s Gemfile ]] && builtin command -v bundle >/dev/null
46
+ # then
47
+ # bundle install | GREP_OPTIONS= \grep -vE '^Using|Your bundle is complete'
48
+ # fi
data/Gemfile ADDED
@@ -0,0 +1,11 @@
1
# Test-only gems live here; the gem's own dependencies are pulled in from
# brandeins.gemspec through `gemspec` below.
source "https://rubygems.org"

group :test do
  # Only Ruby 1.8 gets the minitest gem. The check uses the interpreter's own
  # RUBY_VERSION constant: ENV['RUBY_VERSION'] is nil unless the shell
  # environment provides it, and indexing nil aborted `bundle install`.
  gem 'minitest' if RUBY_VERSION.start_with?('1.8')
  gem 'fakefs'
end

gemspec
data/README.md ADDED
@@ -0,0 +1,26 @@
1
+ # About BrandEins Downloader
2
+
3
+ BrandEins Downloader is a command line tool to download former volumes
4
+ of the German economics magazine "Brand Eins". The articles of former
5
+ volumes are available through their website and BrandEins Downloader takes all
6
+ these fragmented PDFs, downloads and merges them into a single pdf.
7
+
8
+
9
+ ## Requirements
10
+ BrandEins Downloader uses *pdftk* and depends on *ruby*, *rubygems*, and
11
+ several ruby libraries (that you can get through rubygems)
12
+
13
+
14
+ ## Install
15
+ `gem install brandeins`
16
+
17
+
18
+ ## Usage
19
+
20
+ Download just one magazine
21
+
22
+ `brandeins download --path=/Path/where/to/download/the/files --year=2011 --volume=5`
23
+
24
+ Download the whole collection of a certain year
25
+
26
+ `brandeins download_all --path=/Path/where/to/download/the/files --year=2011`
data/Rakefile ADDED
@@ -0,0 +1,18 @@
1
require 'bundler/gem_tasks'
require 'rake/testtask'
require './lib/brandeins/version'

# Test task picking up every file matching test/*_test.rb.
Rake::TestTask.new { |test_task| test_task.pattern = 'test/*_test.rb' }

# Packages the gem from its gemspec.
task(:build) { sh 'gem build brandeins.gemspec' }

# Pushes the gem file that matches the current BrandEins::VERSION.
task(:publish) { sh "gem push brandeins-#{BrandEins::VERSION}.gem" }

task :default => :test
data/TODOS.md ADDED
@@ -0,0 +1,3 @@
1
+ # ToDos
2
+ gem build brandeins.gemspec
3
+ gem push brandeins-*.*.*.gem
data/bin/brandeins ADDED
@@ -0,0 +1,4 @@
1
#!/usr/bin/env ruby

# Executable entry point: loads the library and starts the Thor-based
# command line interface defined in BrandEins::CLI.
require 'brandeins'
BrandEins::CLI.start
data/brandeins.gemspec ADDED
@@ -0,0 +1,29 @@
1
# -*- encoding: utf-8 -*-
$:.push File.expand_path("../lib", __FILE__)
require "brandeins/version"

# Gem specification for brandeins. File, test and executable lists are read
# from the git index, so the gem has to be built inside a git checkout.
Gem::Specification.new do |s|
  s.name        = "brandeins"
  s.version     = BrandEins::VERSION
  s.authors     = ["Gregory Igelmund"]
  s.email       = ["gregory.igelmund@gmail.com"]
  s.homepage    = "http://www.grekko.de"
  s.summary     = %q{BrandEins gem allows you to download past volumes of the Brand Eins magazine}
  s.description = %q{BrandEins gem offers a download command to download a specific or all volumes. Use `brandeins help` to find out more about it}

  # Required by lib/brandeins.rb at runtime.
  s.add_dependency "thor"
  s.add_dependency "nokogiri"
  # rake only drives the Rakefile tasks; nothing under lib/ requires it, so it
  # is a development dependency rather than something every install needs.
  s.add_development_dependency "rake"

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.post_install_message =<<-EOT
BrandEins gem currently runs on unix systems only and depends on pdftk to merge downloaded pdfs.
Run `brandeins setup` to check if all requirements are met and for information on how to meet them.


  EOT
end
@@ -0,0 +1,9 @@
1
# Operating system detection based on the interpreter's platform string.
module BrandEinsHelper
  # Windows builds of Ruby report platforms such as "i386-mswin32",
  # "i386-mingw32" or "x64-mingw-ucrt". The former substring test for 'w32'
  # only recognised the "mingw32" variants.
  WINDOWS_PLATFORM = /mswin|mingw/

  # Mac OS X builds report e.g. "x86_64-darwin12.2.0".
  OSX_PLATFORM = /darwin/

  # @param platform [String] platform string to inspect; defaults to the
  #   running interpreter's RUBY_PLATFORM
  # @return [Boolean] true when the platform string names a Windows build
  def self.windows?(platform = RUBY_PLATFORM)
    !(platform =~ WINDOWS_PLATFORM).nil?
  end

  # @param platform [String] platform string to inspect; defaults to the
  #   running interpreter's RUBY_PLATFORM
  # @return [Boolean] true when the platform string names a darwin build
  def self.osx?(platform = RUBY_PLATFORM)
    !(platform =~ OSX_PLATFORM).nil?
  end
end
@@ -0,0 +1,32 @@
1
# Checks whether the external tools brandeins needs are installed and prints
# the result. The check runs from the constructor, and this file creates an
# instance on its last line, so requiring the file performs the check.
class BrandEinsSetup
  def initialize
    # puts instead of p: p prints the inspected string, i.e. wrapped in quotes.
    puts 'Checking requirements for your system'
    if pdftk?
      puts 'It seems you have pdftk installed on your system.'
    else
      puts 'It seems you are missing pdftk on your system.'
      puts pdfk_install_instructions
    end
  end

  # @return [String] hint telling the user where to get pdftk
  def pdfk_install_instructions
    'Visit http://www.pdflabs.com/docs/install-pdftk/ to install pdftk on your system'
  end

  # @return [Boolean] true when `pdftk --version` runs and its output
  #   contains 'pdftk.com'
  def pdftk?
    cmd? 'pdftk --version', 'pdftk.com'
  end

  private

  # Runs +cmd+ and reports whether any line of its output contains +hint+.
  #
  # A command that cannot be started (for example because the executable is
  # not installed) makes IO.popen raise a SystemCallError such as
  # Errno::ENOENT; that is exactly the "not installed" answer, so it is
  # reported as false instead of aborting the setup check. The block form
  # closes the pipe when done.
  #
  # @param cmd [String] command line to execute
  # @param hint [String] text expected somewhere in the command's output
  # @return [Boolean]
  def cmd?(cmd, hint)
    IO.popen(cmd) { |io| io.readlines.any? { |line| line.include?(hint) } }
  rescue SystemCallError
    false
  end
end

BrandEinsSetup.new
@@ -0,0 +1,3 @@
1
# Namespace of the gem; this file only contributes the release number.
module BrandEins
  # Release number, read by the gemspec, the Rakefile's publish task and the
  # CLI's version command.
  VERSION = '0.0.9'
end
data/lib/brandeins.rb ADDED
@@ -0,0 +1,256 @@
1
%w(
  brandeins/version
  brandeins/helper
  fileutils
  net/http
  nokogiri
  open-uri
  thor
  uri
).each do |lib|
  require lib
end
12
+
13
+ module BrandEins
14
+
15
# Thor-based command line interface started by bin/brandeins.
class CLI < Thor
  map '--version' => :version

  desc '--version', 'Displays current version'
  def version
    # puts instead of p so the version is printed without surrounding quotes.
    puts BrandEins::VERSION
  end

  desc 'download', 'Download past brand eins magazines (use `brandeins help download` to learn more about options)'
  method_option :path, :type => :string
  # Year and volume are required: without them the downloader would receive
  # nil and fail while computing the volume index.
  method_option :volume, :type => :numeric, :required => true
  method_option :year, :type => :numeric, :required => true
  def download
    # A leftover debugging `return false` used to end the command before any
    # download started, and the computed default path was never passed on.
    downloader = BrandEins::Downloader.new(download_path)
    downloader.get_magazine(options[:year], options[:volume])
  end

  desc 'download_all', 'Download all brand eins magazines of one year'
  method_option :path, :type => :string
  method_option :year, :type => :numeric, :required => true
  def download_all
    downloader = BrandEins::Downloader.new(download_path)
    downloader.get_magazines_of_year(options[:year])
  end

  desc 'setup', 'Checks if all requirements to use brandeins are met and gives instructions how to meet them'
  def setup
    # NOTE(review): only Windows and OS X reach the check, so every other
    # platform is reported as unsupported - confirm this is intended.
    if BrandEinsHelper.windows? || BrandEinsHelper.osx?
      # Requiring the file runs the check (it instantiates BrandEinsSetup).
      require 'brandeins/setup'
    else
      puts 'Unknown/unsupported operating system. Please contact the gem author.'
    end
  end

  private

  # Target directory for downloads: the --path option, or the directory one
  # level above this file's directory when the option is omitted.
  def download_path
    options[:path] || File.expand_path('../..', __FILE__)
  end
end
44
+
45
+ class Downloader
46
+ attr_reader :archive
47
+
48
# Remembers where finished PDFs go, derives the scratch directory below that
# path and makes sure the scratch directory exists.
#
# @param path [String] download directory
def initialize(path)
  @dl_dir  = path
  @tmp_dir = path + '/tmp'
  @url     = 'http://www.brandeins.de'
  @archive = false
  create_tmp_dirs
end
55
+
56
# Creates a fresh ArchiveSite for the configured base URL and stores it in
# @archive.
def setup
  @archive = ArchiveSite.new(@url)
end
59
+
60
# Downloads and merges every magazine the archive lists for +year+.
#
# Volumes are numbered from 1 in list order, matching get_magazine, which
# maps volume n to list index n - 1. The loop index used to be taken as the
# volume directly, so the first issue was logged and saved as volume 0.
#
# @param year [Integer] year to download
# @return [Array<String>] the magazine links that were processed
def get_magazines_of_year(year = 2000)
  setup
  puts "Getting all brand eins magazines of #{year}. This could take a while..."
  magazine_links_per_year = @archive.get_magazine_links_by_year(year)
  magazine_links_per_year.each_with_index do |magazine_link, index|
    volume = index + 1
    puts "Parsing Volume #{volume} of #{year}"
    get_magazine_by_link(magazine_link, get_target_pdf(year, volume))
  end
end
70
+
71
# Downloads and merges a single magazine.
#
# @param year [Integer] year of the magazine
# @param volume [Integer] 1-based position of the magazine in that year's list
# @raise [ArgumentError] when the archive lists no such volume for the year;
#   without this check volume 0 silently selected the last issue (index -1)
#   and a too-large volume passed nil on to the page fetch
def get_magazine(year = 2000, volume = 1)
  setup
  puts "Parsing Volume #{volume} of #{year}"

  magazine_links = @archive.get_magazine_links_by_year(year)
  unless volume.between?(1, magazine_links.length)
    raise ArgumentError,
          "Volume #{volume} of #{year} is not available (#{magazine_links.length} volumes found)"
  end

  get_magazine_by_link(magazine_links[volume - 1], get_target_pdf(year, volume))
end
81
+
82
+ private
83
# Creates the scratch directory (including missing parents) unless it is
# already there.
def create_tmp_dirs
  return if File.directory?(@tmp_dir)

  FileUtils.mkdir_p(@tmp_dir)
end
86
+
87
# Downloads all article PDFs of one magazine page and merges them.
#
# The scratch directory is (re)created first: cleanup removes it after every
# magazine, so without this a second magazine in the same run had no
# directory to download into. Cleanup runs in an ensure block so a failed
# download does not leave partial files behind.
#
# @param target_magazine_link [String] URL of the magazine page
# @param target_pdf [String] file name of the merged PDF
# @return the result of process_pdf_links
def get_magazine_by_link(target_magazine_link, target_pdf)
  create_tmp_dirs
  begin
    pdf_links = @archive.magazine_pdf_links(target_magazine_link)
    process_pdf_links(pdf_links, target_pdf)
  ensure
    cleanup
  end
end
92
+
93
# Builds the file name of the merged PDF for a given year and volume.
#
# @return [String] e.g. "Brand-Eins-2011-5.pdf"
def get_target_pdf(year, volume)
  format('Brand-Eins-%s-%s.pdf', year, volume)
end
96
+
97
# Downloads the given PDF links into the scratch directory and merges the
# downloaded files into +target_pdf+.
def process_pdf_links(pdf_links, target_pdf)
  downloaded_files = PDFDownloader.new(pdf_links, @tmp_dir).download_all
  merge_pdfs(downloaded_files, target_pdf)
end
102
+
103
# Merges the downloaded PDFs into one file inside the download directory
# using the external pdftk tool.
#
# pdftk is started with an argument list rather than one interpolated command
# string, so no shell is involved: paths containing spaces or shell
# metacharacters are passed through unchanged instead of being split or
# interpreted.
#
# @param pdf_files [Array<String>] paths of the PDFs, in merge order
# @param target_pdf [String] file name of the merged PDF
# @return [Boolean, nil] the result of Kernel#system
def merge_pdfs(pdf_files, target_pdf)
  puts "Merging single PDFs now"
  command = ['pdftk'] + pdf_files + ['output', "#{@dl_dir}/#{target_pdf}"]
  system(*command)
end
108
+
109
# Removes the scratch directory together with everything inside it.
def cleanup
  FileUtils.rm_r(@tmp_dir)
end
112
+
113
# Downloads a list of PDF URLs into one directory.
class PDFDownloader

  # @param pdf_links [Array<String>] URLs of the PDFs to fetch
  # @param dl_dir [String] directory the files are written to
  def initialize(pdf_links, dl_dir)
    @dl_dir = dl_dir
    @pdf_links = pdf_links
  end

  # Downloads every link. Each file is named after the last path segment of
  # its URL.
  #
  # @return [Array<String>] local file paths, in the order of the links
  def download_all
    @pdf_links.map do |pdf_link|
      pdf_name = @dl_dir + '/' + File.basename(pdf_link)
      download_pdf(pdf_link, pdf_name)
      pdf_name
    end
  end

  private

  # Streams one PDF to disk.
  #
  # The file is opened in binary mode because PDFs are binary data; text mode
  # can alter the bytes (newline conversion on Windows). The request uses
  # request_uri so that a query string in the link is sent along; uri.path
  # dropped it.
  def download_pdf(pdf_url, filename)
    puts "Downloading PDF from #{pdf_url} to #{filename}"
    uri = URI.parse(pdf_url)
    File.open(filename, 'wb') do |f|
      Net::HTTP.start(uri.host, uri.port) do |http|
        http.request_get(uri.request_uri) do |res|
          res.read_body do |seg|
            f << seg
            # Short pause between body segments, kept from the original
            # implementation where it was marked as an adjustable hack.
            sleep 0.005
          end
        end
      end
    end
  end

end
151
+
152
+ class ArchiveSite
153
+ attr_accessor :doc
154
+
155
# Stores the site root and the derived archive page URL. When markup is
# passed in, it is parsed right away; otherwise @doc is left unset so that
# setup fetches the archive page later.
#
# @param base_url [String] site root
# @param html [String, false] optional archive page markup
def initialize(base_url, html = false)
  @base_url = base_url
  @archive_url = base_url + "/archiv.html"
  @doc = Nokogiri::HTML(html) if html
end
162
+
163
# Loads and parses the archive page unless a document is already present
# (for example because markup was handed to the constructor).
#
# The page is fetched with URI#open from open-uri. Calling Kernel#open with a
# URL string relies on open-uri patching Kernel#open, which was deprecated in
# Ruby 2.7 and removed in 3.0.
def setup
  return if defined?(@doc) && @doc

  @doc = Nokogiri::HTML(URI.parse(@archive_url).open)
end
167
+
168
# Collects the absolute links to all magazines the archive page lists for
# +year+. List items that carry an id attribute and items without an anchor
# are skipped.
#
# @param year [Integer] year whose ".jahrgang-<year>" list is read
# @return [Array<String>] magazine page URLs in page order
def get_magazine_links_by_year(year = 2000)
  setup
  puts "Loading Magazine from year #{year}"
  links = []
  @doc.css(".jahrgang-#{year} ul li").each do |item|
    next unless item['id'].nil?

    anchor = item.css('a')[0]
    next if anchor.nil?

    links << @base_url + '/' + anchor['href']
  end
  links
end
185
+
186
# Returns the PDF links found on the magazine page at +url+.
def magazine_pdf_links(url)
  ArchiveMagazine.new(url, @base_url).get_magazine_pdf_links
end
190
+
191
# One magazine page of the archive; collects the PDF links of its articles.
class ArchiveMagazine
  attr_accessor :url, :doc

  # @param url [String] URL of the magazine page
  # @param base_url [String] site root that hrefs are appended to
  # @param html [String, false] optional page markup. When given it is parsed
  #   instead of fetching +url+, as ArchiveSite does; the parameter used to
  #   be accepted but ignored.
  def initialize(url, base_url, html = false)
    puts "Parsing #{url}"
    @url = url
    @base_url = base_url
    # URI#open instead of Kernel#open: open-uri no longer patches Kernel#open
    # for URLs on Ruby 3.0 and later.
    @doc = Nokogiri::HTML(html || URI.parse(url).open)
  end

  # @return [Array<String>] PDF links of the editorial articles followed by
  #   those of the "Schwerpunkt" articles
  def get_magazine_pdf_links
    get_editorial_article_links + get_schwerpunkt_article_links
  end

  def get_schwerpunkt_article_links
    get_links("div.articleList ul h4 a")
  end

  def get_editorial_article_links
    get_links(".editorial-links li a")
  end

  # Follows every article link matched by +css_selector+ and collects the
  # absolute PDF link of each article. Articles without a download link are
  # reported on stdout and skipped.
  #
  # @param css_selector [String] selector for the article anchors
  # @return [Array<String>]
  def get_links(css_selector)
    pdf_links = []
    @doc.css(css_selector).each do |node|
      article_link = @base_url + '/' + node['href']
      pdf_link = MagazineArticle.new(article_link).get_pdf_link
      if pdf_link.nil?
        puts "------------------------------"
        puts "No Content for: #{article_link}"
        puts "------------------------------"
      else
        pdf_links << @base_url + '/' + pdf_link
      end
    end
    pdf_links
  end

  # A single article page.
  class MagazineArticle
    attr_accessor :url, :doc

    # @param url [String] URL of the article page
    # @param html [String, false] optional page markup parsed instead of
    #   fetching +url+
    def initialize(url, html = false)
      puts "Parsing Article: #{url}"
      @url = url
      @doc = Nokogiri::HTML(html || URI.parse(url).open)
    end

    # @return [String, nil] href of the article's download link, or nil when
    #   the page has none
    def get_pdf_link
      link = @doc.css("div#sidebar ul li#downloaden a")[0]
      link && link['href']
    end

  end

end
253
+ end
254
+
255
+ end
256
+ end
@@ -0,0 +1,38 @@
1
+ require File.expand_path('../../lib/brandeins' , __FILE__)
2
+ require 'minitest/autorun'
3
+ require 'fakefs/safe'
4
+
5
# Tests for the downloader's scratch directory handling and for scraping
# magazine links from archive page markup.
class TestBrandEinsDownload < MiniTest::Unit::TestCase
  def setup
    @base_url = 'http://www.brandeins.de'
    @dir = 'bdl'
    # There has to be a way to keep the tests from checking whether the path
    # really exists.
    # Testing in general: http://holmwood.id.au/~lindsay/2008/04/26/hints-for-testing-your-evolving-ruby-scripts/
  end

  # Creating a downloader must create "<dir>/tmp" (inside the fake file system).
  def test_tmp_directories_get_created
    FakeFS do
      BrandEins::Downloader.new @dir
      assert File.directory?(File.expand_path("./#{@dir}/tmp"))
    end
  end

  # Links of one year are returned as absolute URLs in page order.
  def test_magazine_url_scraping
    html = <<-EOF
<div class="jahrgang jahrgang-2012 jahrgang-latest">
<h4>2012</h4>
<ul>
<li><a href="magazin/nein-sagen.html" title="Zum Magazin brand eins Online 1 2012" onmouseover="switch_magazine(2012, 1)" onfocus="switch_magazine(2012, 1)"><img src="typo3temp/pics/b9d755e0d1.jpg" width="55" height="73" alt="Ausgabe 01/2012 SCHWERPUNKT NEIN SAGEN"></a> 1</li>
<li><a href="magazin/markenkommunikation.html" title="Zum Magazin brand eins Online 2 2012" onmouseover="switch_magazine(2012, 2)" onfocus="switch_magazine(2012, 2)"><img src="typo3temp/pics/1dccfc2c74.jpg" width="55" height="73" alt="Ausgabe 02/2012 SCHWERPUNKT Markenkommunikation"></a> 2</li>
</ul>
</div>
    EOF

    archive_site = BrandEins::Downloader::ArchiveSite.new @base_url, html
    magazine_links = archive_site.get_magazine_links_by_year(2012)
    # assert_equal takes (expected, actual); the reversed order produced
    # misleading failure messages.
    assert_equal 2, magazine_links.length
    assert_equal (@base_url + '/magazin/nein-sagen.html'), magazine_links[0]
    assert_equal (@base_url + '/magazin/markenkommunikation.html'), magazine_links[1]
  end
end
metadata ADDED
@@ -0,0 +1,123 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: brandeins
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.9
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Gregory Igelmund
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-11-08 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rake
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: thor
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: nokogiri
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ description: BrandEins gem offers a download command to download a specific or all
63
+ volumes. Use `brandeins help` to find out more about it
64
+ email:
65
+ - gregory.igelmund@gmail.com
66
+ executables:
67
+ - brandeins
68
+ extensions: []
69
+ extra_rdoc_files: []
70
+ files:
71
+ - .gitignore
72
+ - .rvmrc
73
+ - Gemfile
74
+ - README.md
75
+ - Rakefile
76
+ - TODOS.md
77
+ - bin/brandeins
78
+ - brandeins.gemspec
79
+ - lib/brandeins.rb
80
+ - lib/brandeins/helper.rb
81
+ - lib/brandeins/setup.rb
82
+ - lib/brandeins/version.rb
83
+ - test/brandeins_test.rb
84
+ homepage: http://www.grekko.de
85
+ licenses: []
86
+ post_install_message: ! 'BrandEins gem currently runs on unix systems only and depends
87
+ on pdftk to merge downloaded pdfs.
88
+
89
+ Run `brandeins setup` to check if all requirements are met and for informations
90
+ on how to meet them.
91
+
92
+
93
+
94
+ '
95
+ rdoc_options: []
96
+ require_paths:
97
+ - lib
98
+ required_ruby_version: !ruby/object:Gem::Requirement
99
+ none: false
100
+ requirements:
101
+ - - ! '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ segments:
105
+ - 0
106
+ hash: 78102480951930165
107
+ required_rubygems_version: !ruby/object:Gem::Requirement
108
+ none: false
109
+ requirements:
110
+ - - ! '>='
111
+ - !ruby/object:Gem::Version
112
+ version: '0'
113
+ segments:
114
+ - 0
115
+ hash: 78102480951930165
116
+ requirements: []
117
+ rubyforge_project:
118
+ rubygems_version: 1.8.24
119
+ signing_key:
120
+ specification_version: 3
121
+ summary: BrandEins gem allows you to download past volumes of the Brand Eins magazine
122
+ test_files:
123
+ - test/brandeins_test.rb