brandeins 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/.rvmrc ADDED
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # This is an RVM Project .rvmrc file, used to automatically load the ruby
4
+ # development environment upon cd'ing into the directory
5
+
6
+ # First we specify our desired <ruby>[@<gemset>], the @gemset name is optional,
7
+ # Only full ruby name is supported here, for short names use:
8
+ # echo "rvm use 1.9.3" > .rvmrc
9
+ environment_id="ruby-1.9.3-p286@brandeins"
10
+
11
+ # Uncomment the following lines if you want to verify rvm version per project
12
+ # rvmrc_rvm_version="1.16.19 (stable)" # 1.10.1 seems to be a safe start
13
+ # eval "$(echo ${rvm_version}.${rvmrc_rvm_version} | awk -F. '{print "[[ "$1*65536+$2*256+$3" -ge "$4*65536+$5*256+$6" ]]"}' )" || {
14
+ # echo "This .rvmrc file requires at least RVM ${rvmrc_rvm_version}, aborting loading."
15
+ # return 1
16
+ # }
17
+
18
+ # First we attempt to load the desired environment directly from the environment
19
+ # file. This is very fast and efficient compared to running through the entire
20
+ # CLI and selector. If you want feedback on which environment was used then
21
+ # insert the word 'use' after --create as this triggers verbose mode.
22
+ if [[ -d "${rvm_path:-$HOME/.rvm}/environments"
23
+ && -s "${rvm_path:-$HOME/.rvm}/environments/$environment_id" ]]
24
+ then
25
+ \. "${rvm_path:-$HOME/.rvm}/environments/$environment_id"
26
+ [[ -s "${rvm_path:-$HOME/.rvm}/hooks/after_use" ]] &&
27
+ \. "${rvm_path:-$HOME/.rvm}/hooks/after_use" || true
28
+ else
29
+ # If the environment file has not yet been created, use the RVM CLI to select.
30
+ rvm --create "$environment_id" || {
31
+ echo "Failed to create RVM environment '${environment_id}'."
32
+ return 1
33
+ }
34
+ fi
35
+
36
+ # If you use bundler, this might be useful to you:
37
+ # if [[ -s Gemfile ]] && {
38
+ # ! builtin command -v bundle >/dev/null ||
39
+ # builtin command -v bundle | GREP_OPTIONS= \grep $rvm_path/bin/bundle >/dev/null
40
+ # }
41
+ # then
42
+ # printf "%b" "The rubygem 'bundler' is not installed. Installing it now.\n"
43
+ # gem install bundler
44
+ # fi
45
+ # if [[ -s Gemfile ]] && builtin command -v bundle >/dev/null
46
+ # then
47
+ # bundle install | GREP_OPTIONS= \grep -vE '^Using|Your bundle is complete'
48
+ # fi
data/Gemfile ADDED
@@ -0,0 +1,11 @@
1
# Fetch gems over TLS from the canonical RubyGems host.
source "https://rubygems.org"

group :test do
  # minitest only has to be installed explicitly on Ruby 1.8.
  # The check uses the interpreter's RUBY_VERSION constant: the previous
  # ENV['RUBY_VERSION'] lookup returns nil whenever that environment
  # variable is not set, and calling [] on nil raised NoMethodError.
  gem 'minitest' if RUBY_VERSION.start_with?('1.8')
  gem 'fakefs'
end

# Runtime dependencies are declared in brandeins.gemspec.
gemspec
data/README.md ADDED
@@ -0,0 +1,26 @@
1
+ # About BrandEins Downloader
2
+
3
+ BrandEins Downloader is a command line tool to download former volumes
4
+ of the German economics magazine "Brand Eins". The articles of former
5
+ volumes are available through their website, and BrandEins Downloader takes
6
+ all these fragmented PDFs, downloads them, and merges them into a single PDF.
7
+
8
+
9
+ ## Requirements
10
+ BrandEins Downloader uses *pdftk* and depends on *ruby*, *rubygems*, and
11
+ several ruby libraries (that you can get through rubygems)
12
+
13
+
14
+ ## Install
15
+ `gem install brandeins`
16
+
17
+
18
+ ## Usage
19
+
20
+ Download just one magazine
21
+
22
+ `brandeins download --path=/Path/where/to/download/the/files --year=2011 --volume=5`
23
+
24
+ Download the whole collection of a certain year
25
+
26
+ `brandeins download_all --path=/Path/where/to/download/the/files --year=2011`
data/Rakefile ADDED
@@ -0,0 +1,18 @@
1
require 'bundler/gem_tasks'
require 'rake/testtask'
require './lib/brandeins/version'

# `rake test` runs every file matching test/*_test.rb.
Rake::TestTask.new { |test_task| test_task.pattern = 'test/*_test.rb' }

# Builds the gem package from its gemspec.
task(:build) { sh 'gem build brandeins.gemspec' }

# Pushes the gem file that belongs to the current BrandEins::VERSION.
task(:publish) { sh "gem push brandeins-#{BrandEins::VERSION}.gem" }

# Running plain `rake` runs the tests.
task :default => :test
data/TODOS.md ADDED
@@ -0,0 +1,3 @@
1
+ # ToDos
2
+ gem build brandeins.gemspec
3
+ gem push brandeins-*.*.*.gem
data/bin/brandeins ADDED
@@ -0,0 +1,4 @@
1
#!/usr/bin/env ruby

# Command line entry point: load the library and hand the command line
# arguments to the Thor-based CLI.
require 'brandeins'

BrandEins::CLI.start(ARGV)
data/brandeins.gemspec ADDED
@@ -0,0 +1,29 @@
1
# -*- encoding: utf-8 -*-
# Make lib/ loadable so the version constant can be read below.
$:.push File.expand_path("../lib", __FILE__)
require "brandeins/version"

Gem::Specification.new do |s|
  s.name        = "brandeins"
  s.version     = BrandEins::VERSION
  s.authors     = ["Gregory Igelmund"]
  s.email       = ["gregory.igelmund@gmail.com"]
  s.homepage    = "http://www.grekko.de"
  s.summary     = %q{BrandEins gem allows you to download past volumes of the Brand Eins magazine}
  s.description = %q{BrandEins gem offers a download command to download a specific or all volumes. Use `brandeins help` to find out more about it}

  # Libraries required by lib/brandeins.rb at runtime.
  s.add_dependency "thor"
  s.add_dependency "nokogiri"

  # rake is only needed to run the Rakefile tasks, so it must not be forced
  # onto users who merely install the gem.
  s.add_development_dependency "rake"

  # The packaged file lists are taken from the git index.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.post_install_message =<<-EOT
BrandEins gem currently runs on unix systems only and depends on pdftk to merge downloaded pdfs.
Run `brandeins setup` to check if all requirements are met and for informations on how to meet them.


  EOT
end
@@ -0,0 +1,9 @@
1
# Platform detection helpers based on the RUBY_PLATFORM string.
module BrandEinsHelper
  # Fragments of RUBY_PLATFORM that identify a Windows build of Ruby.
  # 'w32' is kept so every platform string the old check accepted still
  # matches; 'mswin' and 'mingw' add e.g. "i386-mswin32" and
  # "x64-mingw-ucrt", which do not contain the substring "w32".
  # A bare "win" is deliberately not used because "darwin" contains it.
  WINDOWS_PLATFORM = /w32|mswin|mingw/

  # @param platform [String] platform string to inspect (defaults to the
  #   running interpreter's RUBY_PLATFORM)
  # @return [Boolean] true when the platform string names a Windows build
  def self.windows?(platform = RUBY_PLATFORM)
    !!(platform =~ WINDOWS_PLATFORM)
  end

  # @param platform [String] platform string to inspect (defaults to the
  #   running interpreter's RUBY_PLATFORM)
  # @return [Boolean] true when the platform string contains "darwin"
  def self.osx?(platform = RUBY_PLATFORM)
    platform.include?('darwin')
  end
end
@@ -0,0 +1,32 @@
1
# Requirement check behind `brandeins setup`: reports whether the external
# `pdftk` command is available. The check runs when an instance is created,
# and this file creates one as soon as it is loaded (see the last line).
class BrandEinsSetup
  # Runs the pdftk check and prints the result.
  def initialize
    # puts instead of p: p prints the inspected string, i.e. with quotes.
    puts 'Checking requirements for your system'
    if pdftk?
      puts 'It seems you have pdftk installed on your system.'
    else
      puts 'It seems you are missing pdftk on your system.'
      puts pdfk_install_instructions
    end
  end

  # The method name keeps its historical spelling so existing callers work.
  #
  # @return [String] hint on where to get pdftk
  def pdfk_install_instructions
    'Visit http://www.pdflabs.com/docs/install-pdftk/ to install pdftk on your system'
  end

  # @return [Boolean] true when `pdftk --version` runs and its output
  #   mentions "pdftk.com"
  def pdftk?
    cmd? 'pdftk --version', 'pdftk.com'
  end

  private

  # Runs +cmd+ and checks its standard output for +hint+.
  #
  # @param cmd [String] command line to execute
  # @param hint [String] text expected somewhere in the command's output
  # @return [Boolean] true when any output line contains +hint+; false when
  #   no line does or when the command could not be started
  def cmd?(cmd, hint)
    # The block form closes the pipe when the block returns.
    IO.popen(cmd) do |io|
      io.readlines.any? { |line| line.include?(hint) }
    end
  rescue SystemCallError
    # A command that is not installed makes popen raise (Errno::ENOENT);
    # for this check that simply means "requirement not met".
    false
  end
end

# Loading this file performs the check immediately.
BrandEinsSetup.new
@@ -0,0 +1,3 @@
1
# Gem namespace; this file contributes only the release number, which the
# gemspec, the Rakefile and the CLI's --version output read.
module BrandEins
  VERSION = "0.0.9"
end
data/lib/brandeins.rb ADDED
@@ -0,0 +1,256 @@
1
%w(
  brandeins/version
  brandeins/helper
  fileutils
  net/http
  nokogiri
  open-uri
  thor
  uri
).each do |lib|
  require lib
end
12
+
13
+ module BrandEins
14
+
15
# Thor-based command line interface behind the `brandeins` executable.
class CLI < Thor
  map '--version' => :version

  desc '--version', 'Displays current version'
  # Prints the gem version (puts, so the value is shown without quotes).
  def version
    puts BrandEins::VERSION
  end

  desc 'download', 'Download past brand eins magazines (use `brandeins help download` to learn more about options)'
  method_option :path, :type => :string
  method_option :volume, :type => :numeric
  method_option :year, :type => :numeric
  # Downloads one volume of one year and merges it into a single PDF.
  #
  # Options: --path (target directory), --year and --volume (both needed).
  # Without --path the directory two levels above this file is used.
  def download
    unless options[:year] && options[:volume]
      puts 'Please specify both --year and --volume (see `brandeins help download`).'
      return
    end

    # The leftover debugging output and early `return false` were removed;
    # they turned this command into a no-op. The downloader now also
    # receives the resolved path instead of the possibly missing option.
    path = options[:path] || File.expand_path('../..', __FILE__)
    downloader = BrandEins::Downloader.new(path)
    downloader.get_magazine(options[:year], options[:volume])
  end

  desc 'setup', 'Checks if all requirements to use brandeins are met and gives instructions how to meet them'
  # Runs the requirement check on Windows and OS X; other platforms get a
  # notice instead.
  def setup
    if BrandEinsHelper.windows? || BrandEinsHelper.osx?
      # Loading this file performs the check as a side effect.
      require 'brandeins/setup'
    else
      puts 'Unknown/unsupported operating system. Please contact the gem author.'
    end
  end
end
44
+
45
# Downloads the article PDFs of brand eins magazine volumes and merges them
# into one PDF per volume using the external `pdftk` command.
class Downloader
  attr_reader :archive

  # @param path [String] directory that receives the merged PDFs; its `tmp`
  #   subdirectory holds the single article PDFs while a volume is processed
  def initialize(path)
    @url     = 'http://www.brandeins.de'
    @archive = false
    @dl_dir  = path
    @tmp_dir = path + '/tmp'
    create_tmp_dirs
  end

  # Creates the archive scraper on first use and reuses it afterwards.
  #
  # @return [ArchiveSite]
  def setup
    @archive ||= ArchiveSite.new @url
  end

  # Downloads and merges every volume listed for +year+.
  #
  # @param year [Integer]
  def get_magazines_of_year(year = 2000)
    setup
    puts "Getting all brand eins magazines of a #{year}. This could take a while..."
    @archive.get_magazine_links_by_year(year).each_with_index do |magazine_link, index|
      # Volumes are counted from 1 (as in #get_magazine), the index from 0.
      volume = index + 1
      puts "Parsing Volume #{volume} of #{year}"
      get_magazine_by_link(magazine_link, get_target_pdf(year, volume))
    end
  end

  # Downloads and merges a single volume.
  #
  # @param year [Integer]
  # @param volume [Integer] one-based position of the volume within the year
  # @return [nil] when the archive lists no such volume
  def get_magazine(year = 2000, volume = 1)
    setup
    puts "Parsing Volume #{volume} of #{year}"
    target_pdf = get_target_pdf(year, volume)

    magazine_links = @archive.get_magazine_links_by_year(year)
    # A volume below 1 would otherwise index the list from its end.
    target_magazine_link = volume >= 1 ? magazine_links[volume - 1] : nil
    if target_magazine_link.nil?
      puts "Could not find volume #{volume} of #{year} in the archive"
      return
    end

    get_magazine_by_link(target_magazine_link, target_pdf)
  end

  private

  # Makes sure the temporary download directory exists.
  def create_tmp_dirs
    FileUtils.mkdir_p @tmp_dir
  end

  # Collects the PDF links of one magazine page, downloads and merges them,
  # then removes the temporary directory.
  def get_magazine_by_link(target_magazine_link, target_pdf)
    # #cleanup deletes the tmp directory after every magazine, so it has to
    # be recreated before the next magazine is downloaded.
    create_tmp_dirs
    pdf_links = @archive.magazine_pdf_links(target_magazine_link)
    process_pdf_links(pdf_links, target_pdf)
    cleanup
  end

  # @return [String] file name of the merged PDF for the given year/volume
  def get_target_pdf(year, volume)
    "Brand-Eins-#{year}-#{volume}.pdf"
  end

  # Downloads all +pdf_links+ into the tmp directory and merges the files.
  def process_pdf_links(pdf_links, target_pdf)
    pdf_downloader = PDFDownloader.new(pdf_links, @tmp_dir)
    pdf_files = pdf_downloader.download_all
    merge_pdfs(pdf_files, target_pdf)
  end

  # Merges +pdf_files+ into +target_pdf+ inside the download directory.
  #
  # @return [Boolean, nil] result of Kernel#system, or false when there is
  #   nothing to merge
  def merge_pdfs(pdf_files, target_pdf)
    if pdf_files.empty?
      puts "No PDFs were downloaded, nothing to merge"
      return false
    end

    puts "Merging single PDFs now"
    # Every argument is passed separately so no shell parses the command:
    # file names with spaces or shell metacharacters stay intact.
    system('pdftk', *pdf_files, 'output', File.join(@dl_dir, target_pdf))
  end

  # Removes the temporary directory including the downloaded single PDFs.
  def cleanup
    FileUtils.rm_r @tmp_dir
  end

  # Downloads a list of PDF URLs into one directory.
  class PDFDownloader
    # @param pdf_links [Array<String>] URLs of the PDFs to fetch
    # @param dl_dir [String] directory the files are written to
    def initialize(pdf_links, dl_dir)
      @dl_dir = dl_dir
      @pdf_links = pdf_links
    end

    # Downloads every link; each file is named after the URL's basename.
    #
    # @return [Array<String>] paths of the written files, in link order
    def download_all
      @pdf_links.map do |pdf_link|
        pdf_name = @dl_dir + '/' + File.basename(pdf_link)
        download_pdf(pdf_link, pdf_name)
        pdf_name
      end
    end

    private

    # Streams the response body of +pdf_url+ into +filename+.
    def download_pdf(pdf_url, filename)
      puts "Downloading PDF from #{pdf_url} to #{filename}"
      uri = URI.parse(pdf_url)
      # 'wb': PDFs are binary data and must not go through newline
      # translation on platforms that apply it in text mode.
      File.open(filename, 'wb') do |f|
        Net::HTTP.start(uri.host, uri.port) do |http|
          # request_uri keeps a query string, which uri.path would drop.
          http.request_get(uri.request_uri) do |res|
            res.read_body do |seg|
              f << seg
              # Short pause per received segment, kept from the original
              # ("hack -- adjust to suit"); presumably a throttle.
              sleep 0.005
            end
          end
        end
      end
    end
  end

  # Scraper for the magazine archive overview page (<base_url>/archiv.html).
  class ArchiveSite
    attr_accessor :doc

    # @param base_url [String] site root, without trailing slash
    # @param html [String, false] archive page markup; when given it is
    #   parsed right away and no request is made for the archive page
    def initialize(base_url, html = false)
      @base_url = base_url
      @archive_url = @base_url + "/archiv.html"
      @doc = Nokogiri::HTML(html) if html
    end

    # Fetches and parses the archive page unless a document is already set.
    def setup
      # URI#open (provided by open-uri) instead of Kernel#open: the latter
      # no longer opens URLs on Ruby 3.0 and later.
      @doc ||= Nokogiri::HTML(URI.parse(@archive_url).open)
    end

    # @param year [Integer]
    # @return [Array<String>] absolute URLs of the magazine pages listed
    #   under the `.jahrgang-<year>` element, in document order
    def get_magazine_links_by_year(year = 2000)
      setup
      puts "Loading Magazine from year #{year}"
      magazine_links = []
      @doc.css(".jahrgang-#{year} ul li").each do |node|
        # List items that carry an id attribute are skipped.
        next unless node['id'].nil?
        anchor = node.css('a')[0]
        next if anchor.nil?
        magazine_links << @base_url + '/' + anchor['href']
      end
      magazine_links
    end

    # @param url [String] URL of one magazine page
    # @return [Array<String>] absolute URLs of that magazine's article PDFs
    def magazine_pdf_links(url)
      magazine = ArchiveMagazine.new(url, @base_url)
      magazine.get_magazine_pdf_links
    end

    # Scraper for a single magazine page.
    class ArchiveMagazine
      attr_accessor :url, :doc

      # @param url [String] URL of the magazine page
      # @param base_url [String] site root used to build absolute links
      # @param html [String, false] page markup; when given it is parsed
      #   instead of fetching +url+ (the argument used to be ignored)
      def initialize(url, base_url, html = false)
        puts "Parsing #{url}"
        @url = url
        @base_url = base_url
        @doc = Nokogiri::HTML(html || URI.parse(url).open)
      end

      # @return [Array<String>] PDF URLs of the editorial articles followed
      #   by those of the "Schwerpunkt" article list
      def get_magazine_pdf_links
        [get_editorial_article_links, get_schwerpunkt_article_links].flatten
      end

      def get_schwerpunkt_article_links
        get_links("div.articleList ul h4 a")
      end

      def get_editorial_article_links
        get_links(".editorial-links li a")
      end

      # Visits every article linked by +css_selector+ and collects the PDF
      # link of each article that offers one.
      #
      # @param css_selector [String]
      # @return [Array<String>] absolute PDF URLs
      def get_links(css_selector)
        pdf_links = []
        @doc.css(css_selector).each do |node|
          href = node['href']
          # An anchor without href cannot be followed.
          next if href.nil?

          article_link = @base_url + '/' + href
          pdf_link = MagazineArticle.new(article_link).get_pdf_link
          if pdf_link.nil?
            puts "------------------------------"
            puts "No Content for: #{article_link}"
            puts "------------------------------"
          else
            pdf_links << @base_url + '/' + pdf_link
          end
        end
        pdf_links
      end

      # Scraper for a single article page.
      class MagazineArticle
        attr_accessor :url, :doc

        # Fetches and parses the article page.
        #
        # @param url [String] URL of the article page
        def initialize(url)
          puts "Parsing Article: #{url}"
          @url = url
          @doc = Nokogiri::HTML(URI.parse(url).open)
        end

        # @return [String, nil] href of the download link in the sidebar,
        #   or nil when the page has no such link
        def get_pdf_link
          link = @doc.css("div#sidebar ul li#downloaden a")[0]
          link && link['href']
        end
      end
    end
  end
end
256
+ end
@@ -0,0 +1,38 @@
1
+ require File.expand_path('../../lib/brandeins' , __FILE__)
2
+ require 'minitest/autorun'
3
+ require 'fakefs/safe'
4
+
5
# Tests for the downloader: creation of the temporary directory and
# scraping of magazine links from archive page markup.
class TestBrandEinsDownload < MiniTest::Unit::TestCase
  def setup
    @base_url = 'http://www.brandeins.de'
    @dir = 'bdl'
    # There has to be a way to keep the tests from checking whether the
    # path really exists.
    # Testing in general: http://holmwood.id.au/~lindsay/2008/04/26/hints-for-testing-your-evolving-ruby-scripts/
  end

  # Creating a Downloader must create <dir>/tmp (inside the fake filesystem).
  def test_tmp_directories_get_created
    FakeFS do
      BrandEins::Downloader.new @dir
      assert File.directory?(File.expand_path("./#{@dir}/tmp"))
    end
  end

  # Magazine links are read from the markup handed to ArchiveSite, so no
  # request for the archive page is needed.
  def test_magazine_url_scraping
    html = <<-EOF
      <div class="jahrgang jahrgang-2012 jahrgang-latest">
      <h4>2012</h4>
      <ul>
      <li><a href="magazin/nein-sagen.html" title="Zum Magazin brand eins Online 1 2012" onmouseover="switch_magazine(2012, 1)" onfocus="switch_magazine(2012, 1)"><img src="typo3temp/pics/b9d755e0d1.jpg" width="55" height="73" alt="Ausgabe 01/2012 SCHWERPUNKT NEIN SAGEN"></a> 1</li>
      <li><a href="magazin/markenkommunikation.html" title="Zum Magazin brand eins Online 2 2012" onmouseover="switch_magazine(2012, 2)" onfocus="switch_magazine(2012, 2)"><img src="typo3temp/pics/1dccfc2c74.jpg" width="55" height="73" alt="Ausgabe 02/2012 SCHWERPUNKT Markenkommunikation"></a> 2</li>
      </ul>
      </div>
    EOF

    archive_site = BrandEins::Downloader::ArchiveSite.new @base_url, html
    magazine_links = archive_site.get_magazine_links_by_year(2012)

    # assert_equal takes (expected, actual); the previous order produced
    # misleading failure messages.
    assert_equal 2, magazine_links.length
    assert_equal (@base_url + '/magazin/nein-sagen.html'), magazine_links[0]
    assert_equal (@base_url + '/magazin/markenkommunikation.html'), magazine_links[1]
  end
end
metadata ADDED
@@ -0,0 +1,123 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: brandeins
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.9
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Gregory Igelmund
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-11-08 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rake
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: thor
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: nokogiri
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ description: BrandEins gem offers a download command to download a specific or all
63
+ volumes. Use `brandeins help` to find out more about it
64
+ email:
65
+ - gregory.igelmund@gmail.com
66
+ executables:
67
+ - brandeins
68
+ extensions: []
69
+ extra_rdoc_files: []
70
+ files:
71
+ - .gitignore
72
+ - .rvmrc
73
+ - Gemfile
74
+ - README.md
75
+ - Rakefile
76
+ - TODOS.md
77
+ - bin/brandeins
78
+ - brandeins.gemspec
79
+ - lib/brandeins.rb
80
+ - lib/brandeins/helper.rb
81
+ - lib/brandeins/setup.rb
82
+ - lib/brandeins/version.rb
83
+ - test/brandeins_test.rb
84
+ homepage: http://www.grekko.de
85
+ licenses: []
86
+ post_install_message: ! 'BrandEins gem currently runs on unix systems only and depends
87
+ on pdftk to merge downloaded pdfs.
88
+
89
+ Run `brandeins setup` to check if all requirements are met and for informations
90
+ on how to meet them.
91
+
92
+
93
+
94
+ '
95
+ rdoc_options: []
96
+ require_paths:
97
+ - lib
98
+ required_ruby_version: !ruby/object:Gem::Requirement
99
+ none: false
100
+ requirements:
101
+ - - ! '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ segments:
105
+ - 0
106
+ hash: 78102480951930165
107
+ required_rubygems_version: !ruby/object:Gem::Requirement
108
+ none: false
109
+ requirements:
110
+ - - ! '>='
111
+ - !ruby/object:Gem::Version
112
+ version: '0'
113
+ segments:
114
+ - 0
115
+ hash: 78102480951930165
116
+ requirements: []
117
+ rubyforge_project:
118
+ rubygems_version: 1.8.24
119
+ signing_key:
120
+ specification_version: 3
121
+ summary: BrandEins gem allows you to download past volumes of the Brand Eins magazine
122
+ test_files:
123
+ - test/brandeins_test.rb