brandeins 0.1.6 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +0 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +50 -0
- data/README.md +2 -2
- data/Rakefile +18 -1
- data/lib/brandeins.rb +6 -310
- data/lib/brandeins/cli.rb +47 -0
- data/lib/brandeins/downloader.rb +112 -0
- data/lib/brandeins/merger/extensions.rb +9 -0
- data/lib/brandeins/merger/pdf_tools.rb +47 -0
- data/lib/brandeins/merger/templates/base.rb +47 -0
- data/lib/brandeins/merger/templates/ghostscript_windows.rb +25 -0
- data/lib/brandeins/merger/templates/osx.rb +11 -0
- data/lib/brandeins/merger/templates/pdftk_osx.rb +25 -0
- data/lib/brandeins/merger/templates/windows.rb +11 -0
- data/lib/brandeins/parser/archive_site.rb +54 -0
- data/lib/brandeins/parser/article_site.rb +26 -0
- data/lib/brandeins/parser/magazine_site.rb +49 -0
- data/lib/brandeins/setup.rb +4 -6
- data/lib/brandeins/version.rb +1 -1
- data/specs/brandeins_spec.rb +39 -0
- data/specs/spec_helper.rb +1 -0
- data/test/brandeins_test.rb +3 -4
- data/test/helper.rb +1 -19
- data/test_support/capture_stdout.rb +12 -0
- data/test_support/fixtures/brandeins_archiv.html +50 -0
- data/test_support/fixtures/cover.jpg +0 -0
- metadata +22 -5
- data/lib/brandeins/pdf-tools.rb +0 -89
@@ -0,0 +1,112 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'brandeins/parser/article_site'
|
3
|
+
require 'brandeins/parser/magazine_site'
|
4
|
+
require 'brandeins/parser/archive_site'
|
5
|
+
require 'brandeins/merger/pdf_tools'
|
6
|
+
require 'net/http'
|
7
|
+
require 'prawn'
|
8
|
+
|
9
|
+
module BrandEins
|
10
|
+
# Downloads the article PDFs of brand eins magazine issues, puts a generated
# cover page in front of them and, when a PDF tool is available, merges
# everything into a single "Brand-Eins-<year>-<volume>.pdf" in the download
# directory.
class Downloader

  # @param path [String] download directory; the temporary working directory
  #   "brand-eins-tmp" is created beneath it
  # @param opts [Hash] :verbose => true switches on progress output by setting
  #   the global $BE_VERBOSE flag (the parser classes read the same flag)
  def initialize(path, opts = {})
    $BE_VERBOSE = !!opts[:verbose]

    @url     = 'http://www.brandeins.de'
    @dl_dir  = path
    @tmp_dir = @dl_dir + '/brand-eins-tmp'
    @pdftool = BrandEins::Merger::PDFTools.get_pdf_tool
    @archive = BrandEins::Parser::ArchiveSite.new(@url)
    create_tmp_dirs
  end

  # Downloads every issue the archive lists for +year+. Volumes are numbered
  # by their position in the archive listing, starting at 1.
  #
  # @param year [Integer]
  # @return [Array<String>] the magazine links that were processed
  def get_magazines_of_year(year = 2000)
    log "Getting all brand eins magazines of #{year}. This may take a while..."
    @archive.get_magazine_links_by_year(year).each_with_index do |magazine_link, index|
      volume = index + 1
      log "Parsing Volume #{volume} of #{year}"
      get_magazine_by_link(magazine_link, pdf_filename(year, volume), year, volume)
    end
  end

  # Downloads a single issue.
  #
  # @param year [Integer]
  # @param volume [Integer] 1-based position of the issue within the year
  # @raise [ArgumentError] when the archive lists no such volume for the year
  def get_magazine(year = 2000, volume = 1)
    log "Parsing Volume #{volume} of #{year}"
    magazine_links = @archive.get_magazine_links_by_year(year)
    # A volume below 1 would silently wrap around to the end of the list.
    magazine_link = volume >= 1 ? magazine_links[volume - 1] : nil
    if magazine_link.nil?
      raise ArgumentError, "No volume #{volume} found for #{year} (#{magazine_links.length} available)"
    end

    get_magazine_by_link(magazine_link, pdf_filename(year, volume), year, volume)
  end

  private

  # Prints +message+ only when verbose output was requested.
  def log(message)
    puts message if $BE_VERBOSE
  end

  # Makes sure the temporary working directory exists.
  def create_tmp_dirs
    FileUtils.mkdir_p @tmp_dir unless File.directory?(@tmp_dir)
  end

  # Downloads the cover image of the issue and renders it, together with the
  # issue title, into a one-page PDF inside the temporary directory.
  #
  # @return [String] path of the generated cover PDF
  def create_cover_pdf(year, volume)
    log "Creating cover for Volume #{volume} of #{year}"
    cover          = @archive.get_magazine_cover(year, volume)
    cover_title    = cover[:title]
    cover_img_url  = cover[:img_url]
    cover_img_file = @tmp_dir + "/cover-#{year}-#{volume}.jpg"
    cover_pdf_file = @tmp_dir + "/cover-#{year}-#{volume}.pdf"

    log "Downloading cover image from #{cover_img_url} to #{cover_img_file}"
    IO.binwrite(cover_img_file, Net::HTTP.get(URI(cover_img_url)))

    log "Creating cover pdf #{cover_pdf_file} from #{cover_img_file}"
    Prawn::Document.generate(cover_pdf_file) do |pdf|
      pdf.text "<font size='18'><b>#{cover_title}</b></font>", :align => :center, :inline_format => true
      pdf.image cover_img_file, :position => :center, :vposition => :center
    end

    cover_pdf_file
  end

  # Downloads all PDFs of the issue behind +target_magazine_link+ and merges
  # them (cover first) into +target_pdf+ inside the download directory.
  def get_magazine_by_link(target_magazine_link, target_pdf, year, volume)
    # cleanup removes the working directory after every merged issue, so it
    # has to be recreated before the next issue is downloaded.
    create_tmp_dirs

    pdf_links = @archive.magazine_pdf_links(target_magazine_link)
    pdf_files = download(pdf_links)
    pdf_files.unshift(create_cover_pdf(year, volume))

    if @pdftool.nil?
      log 'brandeins wont merge the single pdf files since it didnt find an appropriate pdf tool'
    else
      target_pdf_path = "#{@dl_dir}/#{target_pdf}"
      merged = @pdftool.merge_pdf_files(pdf_files, target_pdf_path)
      # Keep the single PDFs when the merge reports a failure so that a retry
      # does not have to download everything again.
      cleanup if merged
    end
  end

  # File name of the merged PDF for the given issue.
  def pdf_filename(year, volume)
    "Brand-Eins-#{year}-#{volume}.pdf"
  end

  # Downloads every link into the temporary directory.
  #
  # @return [Array<String>] local paths, in the order of +pdf_links+
  def download(pdf_links)
    pdf_links.map do |pdf_link|
      local_file = @tmp_dir + '/' + File.basename(pdf_link)
      download_pdf(pdf_link, local_file)
      local_file
    end
  end

  # Fetches +pdf_url+ into +pdf_filename+ unless that file already exists.
  def download_pdf(pdf_url, pdf_filename)
    # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
    if File.exist?(pdf_filename)
      log "File #{pdf_filename} seems to be already downloaded"
      return true
    end

    log "Downloading PDF from #{pdf_url} to #{pdf_filename}"
    IO.binwrite(pdf_filename, Net::HTTP.get(URI(pdf_url)))
  end

  # Removes the temporary working directory with everything in it.
  def cleanup
    FileUtils.rm_r @tmp_dir
  end

end
|
111
|
+
|
112
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'brandeins/merger/extensions'
|
3
|
+
require 'brandeins/merger/templates/base'
|
4
|
+
require 'brandeins/merger/templates/osx'
|
5
|
+
require 'brandeins/merger/templates/pdftk_osx'
|
6
|
+
require 'brandeins/merger/templates/windows'
|
7
|
+
require 'brandeins/merger/templates/ghostscript_windows'
|
8
|
+
|
9
|
+
module BrandEins
|
10
|
+
module Merger
|
11
|
+
# Picks the PDF merge tool template that matches the current platform.
class PDFTools

  class << self
    # Returns an instance of the first merger template registered for the
    # platform, or nil when the platform has no template group or the group
    # has no subclasses.
    #
    # @param env [Hash, nil] optional overrides; :os replaces RUBY_PLATFORM
    # @return [Object, nil] a template instance
    def get_pdf_tool(env = nil)
      # Work on a copy so the caller's hash is not modified.
      @env = (env || {}).dup
      @env[:os] ||= RUBY_PLATFORM

      init_pdf_tools
      return_pdf_tool
    end

    private

    # Instantiates every template of the platform's template group.
    def init_pdf_tools
      template_group = choose_template_group
      @pdf_tools = template_group ? get_subclasses(template_group) : []
    end

    # Maps the platform string to its template base class; nil when the
    # platform is neither Windows nor OS X.
    def choose_template_group
      os = @env[:os].to_s
      # 'w32' covers "mingw32"/"mswin32"; newer Windows builds report
      # e.g. "x64-mingw-ucrt", hence the additional names.
      return BrandEins::Merger::Templates::Windows if os =~ /w32|mingw|mswin/
      return BrandEins::Merger::Templates::OSX if os.include? 'darwin'
      nil
    end

    # @return [Array] one instance per direct subclass of +klass+
    def get_subclasses(klass)
      klass.subclasses.map { |sklass| sklass.new }
    end

    # @return [Object, nil] the first instantiated template, if any
    def return_pdf_tool
      @pdf_tools.first
    end

  end

end
|
46
|
+
end
|
47
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module BrandEins
|
4
|
+
module Merger
|
5
|
+
module Templates
|
6
|
+
|
7
|
+
# Base class of all PDF merge tool templates. Subclasses provide the
# executable (#cmd), its merge arguments (#args) and harmless arguments used
# to probe for the executable (#noop).
class Base

  # @return [Boolean] true when the tool's executable could be started
  def available?
    _cmd_available? cmd, noop
  end

  def cmd; raise "Must be implemtented by the subclasses"; end
  def noop; raise "Must be implemtented by the subclasses"; end
  def args; raise "Must be implemtented by the subclasses"; end

  # Merges +pdf_files+ (in the given order) into +target_pdf+ by running the
  # tool and waiting for it to finish.
  #
  # @param pdf_files [Array<String>] paths of the PDFs to merge
  # @param target_pdf [String] path of the merged PDF
  # @return [Boolean] true when the tool exited successfully, false when it
  #   could not be run, exited with an error or an exception was raised
  def merge_pdf_files(pdf_files, target_pdf)
    command = [cmd] + _build_argv(pdf_files, target_pdf)
    puts "executing: #{command.join(' ')}"
    return true if _exec(*command)

    puts "error: #{command.first} failed or could not be executed"
    false
  rescue StandardError => e
    puts "error: #{e.inspect}"
    false
  end

  private

  # Expands the placeholders of #args into an argument vector. The file
  # names are handed over as separate arguments instead of a shell string,
  # so blanks or quotes in a path cannot break or alter the command.
  def _build_argv(pdf_files, target_pdf)
    files = pdf_files.map { |pdf_file| pdf_file.to_s }
    args.flat_map do |arg|
      if arg == '__pdf_files__'
        files
      else
        # Block form of gsub: backslashes in (Windows) paths must not be
        # interpreted as back-references.
        [arg.gsub(/__pdf_files__/) { files.join(' ') }.gsub(/__target_pdf__/) { target_pdf.to_s }]
      end
    end
  end

  # Runs the command and blocks until it has finished, so callers can safely
  # remove the input files afterwards. A single string is run through the
  # shell, several arguments are executed directly.
  #
  # @return [true, false, nil] true on exit status 0, false on a non-zero
  #   status, nil when the command could not be executed
  def _exec(*command)
    # The tool's own stdout is only shown in verbose mode.
    redirect = $BE_VERBOSE ? {} : { :out => File::NULL }
    system(*command, redirect)
  end

  # Probes whether +cmd+ can be started with the given no-op arguments.
  def _cmd_available?(cmd, args)
    argv = Array(args).flat_map { |arg| arg.to_s.split }
    # system returns nil exactly when the command could not be executed.
    !system(cmd, *argv, :out => File::NULL, :err => File::NULL).nil?
  rescue StandardError
    false
  end
end
|
44
|
+
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module BrandEins
|
4
|
+
module Merger
|
5
|
+
module Templates
|
6
|
+
|
7
|
+
# Windows merger template that uses Ghostscript (gswin64c.exe).
class GhostscriptWin < BrandEins::Merger::Templates::Windows

  # Name of the executable to run.
  def cmd; 'gswin64c.exe'; end

  # Argument template for a merge; the __target_pdf__ and __pdf_files__
  # placeholders are substituted in Base#merge_pdf_files.
  def args
    %w[-dNOPAUSE -dBATCH -sDEVICE=pdfwrite -sOutputFile=__target_pdf__ __pdf_files__]
  end

  # Arguments used when probing for the executable.
  def noop
    %w[--version]
  end

end
|
22
|
+
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module BrandEins
|
4
|
+
module Merger
|
5
|
+
module Templates
|
6
|
+
|
7
|
+
# OS X merger template that uses pdftk.
class PdftkOSX < BrandEins::Merger::Templates::OSX

  # Name of the executable to run.
  def cmd; 'pdftk'; end

  # Argument template for a merge; the __pdf_files__ and __target_pdf__
  # placeholders are substituted in Base#merge_pdf_files.
  def args
    %w[__pdf_files__ output __target_pdf__]
  end

  # Arguments used when probing for the executable.
  def noop
    %w[--version]
  end

end
|
22
|
+
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'net/http'
|
4
|
+
|
5
|
+
module BrandEins
|
6
|
+
module Parser
|
7
|
+
# Parser for the brand eins archive page ("<base_url>/archiv.html").
class ArchiveSite

  # @param base_url [String] site root without trailing slash
  # @param opts [Hash] :html => String supplies the archive markup directly,
  #   in which case the page is not fetched
  def initialize(base_url, opts = {})
    @base_url = base_url
    @archive_url = @base_url + "/archiv.html"
    html = opts[:html]
    @doc = Nokogiri::HTML(html) if html
  end

  # Parsed archive page; fetched on first use unless markup was given to
  # the constructor.
  def doc
    @doc ||= Nokogiri::HTML(Net::HTTP.get(URI(@archive_url)))
  end

  # Collects the links of all magazines listed for +year+.
  #
  # @param year [Integer]
  # @return [Array<String>] absolute magazine URLs in page order
  def get_magazine_links_by_year(year = 2000)
    puts "Loading Magazine from year #{year}" if $BE_VERBOSE
    doc.css(".jahrgang-#{year} ul li").each_with_object([]) do |node, links|
      # List items carrying an id are not collected.
      next unless node['id'].nil?

      anchor = node.css('a')[0]
      href = anchor && anchor['href']
      # Skip items without a link, including anchors lacking an href
      # (which would otherwise raise on the string concatenation).
      next if href.nil?

      links << @base_url + '/' + href
    end
  end

  # Looks up title and cover image of an issue.
  #
  # @return [Hash] :title => String, :img_url => String; when several images
  #   match, the last one with a src wins; without any image the URL is just
  #   "<base_url>/"
  def get_magazine_cover(year, volume)
    title = doc.css("#month_detail_#{year}_#{volume} .titel").children[0].to_s
    sources = doc.css("#month_detail_#{year}_#{volume} .cover a img").map { |node| node['src'] }
    img_url = sources.compact.last || ''
    { :title => title, :img_url => @base_url + '/' + img_url }
  end

  # PDF links of all articles of the magazine page at +url+.
  def magazine_pdf_links(url)
    magazine = BrandEins::Parser::MagazineSite.new(url, @base_url)
    magazine.get_magazine_pdf_links
  end

end
|
51
|
+
|
52
|
+
end
|
53
|
+
|
54
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'net/http'
|
4
|
+
|
5
|
+
module BrandEins
|
6
|
+
module Parser
|
7
|
+
# Parser for a single article page.
class ArticleSite

  # @param url [String] address of the article page
  def initialize(url)
    @url = url
  end

  # Parsed article page, fetched from @url on first use.
  def doc
    @doc ||= Nokogiri::HTML(Net::HTTP.get(URI(@url)))
  end

  # Returns the href of the first download link in the sidebar, or nil
  # when the page has no such link.
  def get_pdf_link
    puts "Parsing Article: #{@url}" if $BE_VERBOSE
    anchor = doc.css("div#sidebar ul li#downloaden a")[0]
    anchor && anchor['href']
  end

end
|
24
|
+
|
25
|
+
end
|
26
|
+
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'net/http'
|
4
|
+
|
5
|
+
module BrandEins
|
6
|
+
module Parser
|
7
|
+
# Parser for the page of a single magazine issue.
class MagazineSite

  # @param url [String] address of the magazine page
  # @param base_url [String] site root that relative links are appended to
  def initialize(url, base_url)
    @url, @base_url = url, base_url
  end

  # Parsed magazine page, fetched from @url on first use.
  def doc
    @doc ||= Nokogiri::HTML(Net::HTTP.get(URI(@url)))
  end

  # @return [Array<String>] PDF URLs of the editorial articles followed by
  #   those of the "Schwerpunkt" articles
  def get_magazine_pdf_links
    puts "Parsing #{@url}" if $BE_VERBOSE
    get_editorial_article_links + get_schwerpunkt_article_links
  end

  # PDF URLs of the articles in the article list.
  def get_schwerpunkt_article_links
    get_links("div.articleList ul h4 a")
  end

  # PDF URLs of the articles linked from the editorial section.
  def get_editorial_article_links
    get_links(".editorial-links li a")
  end

  # Visits every article matched by +css_selector+ and collects the URL of
  # its PDF. Articles without a PDF are skipped; in verbose mode a notice
  # is printed for them.
  #
  # @return [Array<String>]
  def get_links(css_selector)
    doc.css(css_selector).each_with_object([]) do |node, links|
      href = node['href']
      # An anchor without href cannot be followed.
      next if href.nil?

      article_link = @base_url + '/' + href
      pdf_link = BrandEins::Parser::ArticleSite.new(article_link).get_pdf_link

      # Skip the article regardless of the verbose flag; only the notice
      # depends on it.
      if pdf_link.nil?
        if $BE_VERBOSE
          puts "-------------------------------"
          puts "No Content for: #{article_link}"
          puts "-------------------------------"
        end
        next
      end

      links << @base_url + '/' + pdf_link
    end
  end
end
|
46
|
+
|
47
|
+
end
|
48
|
+
|
49
|
+
end
|
data/lib/brandeins/setup.rb
CHANGED
@@ -1,13 +1,11 @@
|
|
1
|
-
|
1
|
+
# encoding: utf-8
|
2
2
|
|
3
3
|
module BrandEins
|
4
4
|
class Setup
|
5
|
-
attr_reader :pdf_tool
|
6
5
|
|
7
|
-
def initialize(env =
|
8
|
-
env = Hash.new if env.nil?
|
6
|
+
# Remembers the target platform and the PDF tool.
#
# @param env [Hash, nil] optional overrides: :os (defaults to RUBY_PLATFORM)
#   and :pdf_tool (defaults to the tool picked by
#   BrandEins::Merger::PDFTools.get_pdf_tool)
def initialize(env = {})
# An explicit nil is treated like an empty hash.
env ||= {}
@os = env[:os] || RUBY_PLATFORM
# The tool lookup is defined as BrandEins::Merger::PDFTools.
@pdf_tool = env[:pdf_tool] || BrandEins::Merger::PDFTools.get_pdf_tool(env)
end
|
12
10
|
|
13
11
|
def run
|
@@ -37,4 +35,4 @@ module BrandEins
|
|
37
35
|
end
|
38
36
|
|
39
37
|
end
|
40
|
-
end
|
38
|
+
end
|