brandeins-dl 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile CHANGED
@@ -1,3 +1,11 @@
1
1
  source "http://rubygems.org"
2
2
 
3
+ group :test do
4
+ if ENV['RUBY_VERSION'][5,3] == '1.8'
5
+ gem 'minitest'
6
+ end
7
+ gem 'fakefs'
8
+ end
9
+
10
+
3
11
  gemspec
data/Rakefile CHANGED
@@ -1 +1,6 @@
1
- require "bundler/gem_tasks"
1
+ require 'bundler/gem_tasks'
2
+ require 'rake/testtask'
3
+
4
+ Rake::TestTask.new do |t|
5
+ t.pattern = 'test/*_test.rb'
6
+ end
data/lib/brandeins-dl/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module BrandEins
2
- VERSION = "0.0.3"
2
+ VERSION = '0.0.4'
3
3
  end
data/lib/brandeins-dl.rb CHANGED
@@ -1,9 +1,13 @@
1
- require "brandeins-dl/version"
2
- require 'nokogiri'
3
- require 'open-uri'
4
- require 'uri'
5
- require 'fileutils'
6
- require 'thor'
1
+ %w(
2
+ brandeins-dl/version
3
+ nokogiri
4
+ open-uri
5
+ uri
6
+ fileutils
7
+ thor
8
+ ).each do |lib|
9
+ require lib
10
+ end
7
11
 
8
12
  module BrandEins
9
13
 
@@ -28,94 +32,99 @@ module BrandEins
28
32
 
29
33
  class Downloader
30
34
  attr_reader :archive
31
-
35
+
32
36
  def initialize(path)
33
- @url = "http://www.brandeins.de"
34
- @archive = ArchiveSite.new
37
+ @url = 'http://www.brandeins.de'
38
+ @archive = false
35
39
  @dl_dir = path
36
- @tmp_dir = path + "/tmp"
37
-
38
- check_download_path
40
+ @tmp_dir = path + '/tmp'
41
+ create_tmp_dirs
39
42
  end
40
43
 
41
- def check_download_path
42
- FileUtils.mkdir_p @tmp_dir unless File.exists?(@tmp_dir)
44
+ def setup
45
+ @archive = ArchiveSite.new @url
43
46
  end
44
-
47
+
45
48
  def get_magazines_of_year(year = 2000)
49
+ setup
46
50
  puts "Getting all brand eins magazines of a #{year}. This could take a while..."
47
- magazine_links_per_year = @archive.magazine_links_by_year(year)
51
+ magazine_links_per_year = @archive.get_magazine_links_by_year(year)
48
52
  magazine_links_per_year.each_with_index do |magazine_link, volume|
49
53
  puts "Parsing Volume #{volume} of #{year}"
50
54
  target_pdf = get_target_pdf(year, volume)
51
55
  get_magazine_by_link(magazine_link, target_pdf)
52
56
  end
53
57
  end
54
-
58
+
55
59
  def get_magazine(year = 2000, volume = 1)
60
+ setup
56
61
  puts "Parsing Volume #{volume} of #{year}"
57
62
  target_pdf = get_target_pdf(year, volume)
58
-
59
- magazine_links = @archive.magazine_links_by_year(year)
63
+
64
+ magazine_links = @archive.get_magazine_links_by_year(year)
60
65
  target_magazine_link = magazine_links[volume-1]
61
-
66
+
62
67
  get_magazine_by_link(target_magazine_link, target_pdf)
63
68
  end
64
-
69
+
70
+ private
71
+ def create_tmp_dirs
72
+ FileUtils.mkdir_p @tmp_dir unless File.directory?(@tmp_dir)
73
+ end
74
+
65
75
  def get_magazine_by_link(target_magazine_link, target_pdf)
66
76
  pdf_links = @archive.magazine_pdf_links(target_magazine_link)
67
77
  process_pdf_links(pdf_links, target_pdf)
68
78
  cleanup
69
79
  end
70
-
71
-
80
+
72
81
  def get_target_pdf(year, volume)
73
82
  "Brand-Eins-#{year}-#{volume}.pdf"
74
83
  end
75
-
84
+
76
85
  def process_pdf_links(pdf_links, target_pdf)
77
86
  pdf_downloader = PDFDownloader.new(pdf_links, @tmp_dir)
78
87
  pdf_files = pdf_downloader.download_all
79
88
  merge_pdfs(pdf_files, target_pdf)
80
89
  end
81
-
90
+
82
91
  def merge_pdfs(pdf_files, target_pdf)
83
92
  puts "Merging single PDFs now"
84
93
  pdf_sources = pdf_files.join(" ")
85
94
  system "pdftk #{pdf_sources} output #{@dl_dir}/#{target_pdf}"
86
95
  end
87
-
96
+
88
97
  def cleanup
89
98
  FileUtils.rm_r @tmp_dir
90
99
  end
91
-
100
+
92
101
  class PDFDownloader
93
-
102
+
94
103
  def initialize(pdf_links, dl_dir)
95
104
  @dl_dir = dl_dir
96
105
  @pdf_links = pdf_links
97
106
  end
98
-
107
+
99
108
  def download_all
100
109
  pdf_files = Array.new
101
110
  @pdf_links.each do |pdf_link|
102
111
  pdf_name = @dl_dir + '/' + File.basename(pdf_link)
103
112
  pdf_url = pdf_link
104
113
  download_pdf(pdf_url, pdf_name)
105
-
114
+
106
115
  pdf_files << pdf_name
107
116
  end
108
117
  pdf_files
109
118
  end
110
-
111
- private
112
-
119
+
120
+ private
121
+
113
122
  def download_pdf(pdf_url, filename)
114
123
  puts "Downloading PDF from #{pdf_url} to #{filename}"
115
124
  File.open(filename,'w') do |f|
116
125
  uri = URI.parse(pdf_url)
117
- Net::HTTP.start(uri.host,uri.port) do |http|
118
- http.request_get(uri.path) do |res|
126
+ Net::HTTP.start(uri.host,uri.port) do |http|
127
+ http.request_get(uri.path) do |res|
119
128
  res.read_body do |seg|
120
129
  f << seg
121
130
  #hack -- adjust to suit:
@@ -125,20 +134,26 @@ module BrandEins
125
134
  end
126
135
  end
127
136
  end
128
-
137
+
129
138
  end
130
-
139
+
131
140
  class ArchiveSite
132
-
133
141
  attr_accessor :doc
134
-
135
- def initialize
136
- @base_url = "http://www.brandeins.de"
142
+
143
+ def initialize(base_url, html = false)
144
+ @base_url = base_url
137
145
  @archive_url = @base_url + "/archiv.html"
146
+ if html
147
+ @doc = Nokogiri::HTML(html)
148
+ end
149
+ end
150
+
151
+ def setup
152
+ return if defined?(@doc) != nil
138
153
  @doc = Nokogiri::HTML(open(@archive_url))
139
154
  end
140
-
141
- def magazine_links_by_year(year = 2000)
155
+
156
+ def get_magazine_links_by_year(year = 2000)
142
157
  puts "Loading Magazine from year #{year}"
143
158
  magazine_nodes_with_meta = @doc.css(".jahrgang-#{year} ul li")
144
159
  magazine_links = Array.new
@@ -154,35 +169,35 @@ module BrandEins
154
169
  end
155
170
  magazine_links
156
171
  end
157
-
172
+
158
173
  def magazine_pdf_links(url)
159
- magazine = ArchiveMagazine.new(url)
174
+ magazine = ArchiveMagazine.new(url, @base_url)
160
175
  magazine.get_magazine_pdf_links
161
176
  end
162
-
177
+
163
178
  class ArchiveMagazine
164
179
  attr_accessor :url, :doc
165
-
166
- def initialize(url)
180
+
181
+ def initialize(url, base_url, html = false)
167
182
  puts "Parsing #{url}"
168
183
  @url = url
169
- @base_url = "http://www.brandeins.de"
184
+ @base_url = base_url
170
185
  @doc = Nokogiri::HTML(open(url))
171
186
  end
172
-
187
+
173
188
  def get_magazine_pdf_links
174
189
  [get_editorial_article_links, get_schwerpunkt_article_links].flatten
175
-
190
+
176
191
  end
177
-
192
+
178
193
  def get_schwerpunkt_article_links
179
194
  get_links("div.articleList ul h4 a")
180
195
  end
181
-
196
+
182
197
  def get_editorial_article_links
183
198
  get_links(".editorial-links li a")
184
199
  end
185
-
200
+
186
201
  def get_links(css_selector)
187
202
  pdf_links = Array.new
188
203
  link_nodes = @doc.css(css_selector)
@@ -200,16 +215,16 @@ module BrandEins
200
215
  end
201
216
  pdf_links
202
217
  end
203
-
218
+
204
219
  class MagazineArticle
205
220
  attr_accessor :url, :doc
206
-
221
+
207
222
  def initialize(url)
208
223
  puts "Parsing Article: #{url}"
209
224
  @url = url
210
225
  @doc = Nokogiri::HTML(open(url))
211
226
  end
212
-
227
+
213
228
  def get_pdf_link
214
229
  link = @doc.css("div#sidebar ul li#downloaden a")
215
230
  if link[0].nil? then
@@ -222,7 +237,6 @@ module BrandEins
222
237
  end
223
238
 
224
239
  end
225
-
226
240
  end
227
241
 
228
242
  end
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: brandeins-dl
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.0.3
5
+ version: 0.0.4
6
6
  platform: ruby
7
7
  authors:
8
8
  - Gregory Igelmund
@@ -10,10 +10,11 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2012-02-26 00:00:00 Z
13
+ date: 2012-10-06 00:00:00 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: rake
17
+ prerelease: false
17
18
  requirement: &id001 !ruby/object:Gem::Requirement
18
19
  none: false
19
20
  requirements:
@@ -21,10 +22,10 @@ dependencies:
21
22
  - !ruby/object:Gem::Version
22
23
  version: "0"
23
24
  type: :runtime
24
- prerelease: false
25
25
  version_requirements: *id001
26
26
  - !ruby/object:Gem::Dependency
27
27
  name: thor
28
+ prerelease: false
28
29
  requirement: &id002 !ruby/object:Gem::Requirement
29
30
  none: false
30
31
  requirements:
@@ -32,10 +33,10 @@ dependencies:
32
33
  - !ruby/object:Gem::Version
33
34
  version: "0"
34
35
  type: :runtime
35
- prerelease: false
36
36
  version_requirements: *id002
37
37
  - !ruby/object:Gem::Dependency
38
38
  name: nokogiri
39
+ prerelease: false
39
40
  requirement: &id003 !ruby/object:Gem::Requirement
40
41
  none: false
41
42
  requirements:
@@ -43,7 +44,6 @@ dependencies:
43
44
  - !ruby/object:Gem::Version
44
45
  version: "0"
45
46
  type: :runtime
46
- prerelease: false
47
47
  version_requirements: *id003
48
48
  description: "BrandEins Downloader offers two commands: 'brandeins download YEAR' and 'brandeins download YEAR --volume=NUMBER'"
49
49
  email:
@@ -77,23 +77,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
77
77
  requirements:
78
78
  - - ">="
79
79
  - !ruby/object:Gem::Version
80
- hash: -1677672259199041797
81
- segments:
82
- - 0
83
80
  version: "0"
84
81
  required_rubygems_version: !ruby/object:Gem::Requirement
85
82
  none: false
86
83
  requirements:
87
84
  - - ">="
88
85
  - !ruby/object:Gem::Version
89
- hash: -1677672259199041797
90
- segments:
91
- - 0
92
86
  version: "0"
93
87
  requirements: []
94
88
 
95
89
  rubyforge_project:
96
- rubygems_version: 1.8.6
90
+ rubygems_version: 1.8.15
97
91
  signing_key:
98
92
  specification_version: 3
99
93
  summary: BrandEins Downloader allows you to download past volumes of the Brand Eins magazine