brandeins 0.1.6 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +0 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +50 -0
- data/README.md +2 -2
- data/Rakefile +18 -1
- data/lib/brandeins.rb +6 -310
- data/lib/brandeins/cli.rb +47 -0
- data/lib/brandeins/downloader.rb +112 -0
- data/lib/brandeins/merger/extensions.rb +9 -0
- data/lib/brandeins/merger/pdf_tools.rb +47 -0
- data/lib/brandeins/merger/templates/base.rb +47 -0
- data/lib/brandeins/merger/templates/ghostscript_windows.rb +25 -0
- data/lib/brandeins/merger/templates/osx.rb +11 -0
- data/lib/brandeins/merger/templates/pdftk_osx.rb +25 -0
- data/lib/brandeins/merger/templates/windows.rb +11 -0
- data/lib/brandeins/parser/archive_site.rb +54 -0
- data/lib/brandeins/parser/article_site.rb +26 -0
- data/lib/brandeins/parser/magazine_site.rb +49 -0
- data/lib/brandeins/setup.rb +4 -6
- data/lib/brandeins/version.rb +1 -1
- data/specs/brandeins_spec.rb +39 -0
- data/specs/spec_helper.rb +1 -0
- data/test/brandeins_test.rb +3 -4
- data/test/helper.rb +1 -19
- data/test_support/capture_stdout.rb +12 -0
- data/test_support/fixtures/brandeins_archiv.html +50 -0
- data/test_support/fixtures/cover.jpg +0 -0
- metadata +22 -5
- data/lib/brandeins/pdf-tools.rb +0 -89
@@ -0,0 +1,112 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'brandeins/parser/article_site'
require 'brandeins/parser/magazine_site'
require 'brandeins/parser/archive_site'
require 'brandeins/merger/pdf_tools'
require 'fileutils'
require 'net/http'
require 'prawn'
|
8
|
+
|
9
|
+
module BrandEins
  # Downloads the article PDFs of brand eins magazine volumes, puts a generated
  # cover page in front of them and - when a PDF merge tool is available -
  # merges everything into one "Brand-Eins-<year>-<volume>.pdf" file inside the
  # download directory.
  class Downloader

    # path - download directory. Merged PDFs are written here and the
    #        temporary working directory "brand-eins-tmp" is created below it.
    # opts - options hash. A truthy :verbose enables progress output by
    #        setting the global $BE_VERBOSE.
    def initialize(path, opts = {})
      $BE_VERBOSE = !!opts[:verbose]

      @url     = 'http://www.brandeins.de'
      @dl_dir  = path
      @tmp_dir = File.join(@dl_dir, 'brand-eins-tmp')
      @pdftool = BrandEins::Merger::PDFTools.get_pdf_tool
      @archive = BrandEins::Parser::ArchiveSite.new(@url)
      create_tmp_dirs
    end

    # Downloads every magazine the archive lists for +year+. Volumes are
    # numbered by their position in the archive's link list, starting at 1.
    def get_magazines_of_year(year = 2000)
      puts "Getting all brand eins magazines of #{year}. This may take a while..." if $BE_VERBOSE
      magazine_links_per_year = @archive.get_magazine_links_by_year(year)
      magazine_links_per_year.each_with_index do |magazine_link, index|
        volume = index + 1
        puts "Parsing Volume #{volume} of #{year}" if $BE_VERBOSE
        get_magazine_by_link(magazine_link, pdf_filename(year, volume), year, volume)
      end
    end

    # Downloads the single magazine +volume+ (1-based) of +year+.
    #
    # Raises ArgumentError when the archive lists no such volume. Without this
    # check a volume of 0 would silently select the last volume of the year
    # (index -1) and a too large volume would fail later with an unrelated
    # error about a nil URL.
    def get_magazine(year = 2000, volume = 1)
      puts "Parsing Volume #{volume} of #{year}" if $BE_VERBOSE
      magazine_links = @archive.get_magazine_links_by_year(year)
      index = volume - 1
      magazine_link = index >= 0 ? magazine_links[index] : nil
      if magazine_link.nil?
        raise ArgumentError, "Volume #{volume} of #{year} was not found in the archive"
      end

      get_magazine_by_link(magazine_link, pdf_filename(year, volume), year, volume)
    end

    private

    # Makes sure the temporary working directory exists.
    def create_tmp_dirs
      FileUtils.mkdir_p @tmp_dir unless File.directory?(@tmp_dir)
    end

    # Downloads the cover image of the given volume and renders it, together
    # with the cover title, into a one-page PDF inside the temporary
    # directory. Returns the path of that PDF.
    def create_cover_pdf(year, volume)
      puts "Creating cover for Volume #{volume} of #{year}" if $BE_VERBOSE
      cover          = @archive.get_magazine_cover(year, volume)
      cover_title    = cover[:title]
      cover_img_url  = cover[:img_url]
      cover_img_file = File.join(@tmp_dir, "cover-#{year}-#{volume}.jpg")
      cover_pdf_file = File.join(@tmp_dir, "cover-#{year}-#{volume}.pdf")

      puts "Downloading cover image from #{cover_img_url} to #{cover_img_file}" if $BE_VERBOSE
      IO.binwrite(cover_img_file, Net::HTTP.get(URI(cover_img_url)))

      puts "Creating cover pdf #{cover_pdf_file} from #{cover_img_file}" if $BE_VERBOSE
      Prawn::Document.generate(cover_pdf_file) do |pdf|
        pdf.text "<font size='18'><b>" + cover_title + "</b></font>", :align => :center, :inline_format => true
        pdf.image cover_img_file, :position => :center, :vposition => :center
      end

      cover_pdf_file
    end

    # Downloads all article PDFs linked from the magazine page, prepends the
    # cover PDF and merges the result into +target_pdf+ (a file name relative
    # to the download directory).
    def get_magazine_by_link(target_magazine_link, target_pdf, year, volume)
      # cleanup removes the temporary directory after every merged volume, so
      # it has to be recreated before the next volume is downloaded.
      create_tmp_dirs

      pdf_links = @archive.magazine_pdf_links(target_magazine_link)
      pdf_files = download(pdf_links)
      pdf_files.unshift(create_cover_pdf(year, volume))

      if @pdftool.nil?
        puts 'brandeins wont merge the single pdf files since it didnt find an appropriate pdf tool' if $BE_VERBOSE
        return
      end

      target_pdf_path = File.join(@dl_dir, target_pdf)
      if @pdftool.merge_pdf_files(pdf_files, target_pdf_path)
        cleanup
      elsif $BE_VERBOSE
        # Keep the downloaded parts so a later run does not download them again.
        puts "Merging into #{target_pdf_path} failed, keeping downloaded files in #{@tmp_dir}"
      end
    end

    # File name of the merged PDF for the given volume.
    def pdf_filename(year, volume)
      "Brand-Eins-#{year}-#{volume}.pdf"
    end

    # Downloads every PDF link into the temporary directory and returns the
    # local file paths in the order of +pdf_links+.
    def download(pdf_links)
      pdf_links.map do |pdf_link|
        pdf_filename = File.join(@tmp_dir, File.basename(pdf_link))
        download_pdf(pdf_link, pdf_filename)
        pdf_filename
      end
    end

    # Downloads +pdf_url+ to +pdf_filename+ unless that file already exists.
    def download_pdf(pdf_url, pdf_filename)
      # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
      if File.exist? pdf_filename
        puts "File #{pdf_filename} seems to be already downloaded" if $BE_VERBOSE
        return true
      end

      puts "Downloading PDF from #{pdf_url} to #{pdf_filename}" if $BE_VERBOSE
      IO.binwrite(pdf_filename, Net::HTTP.get(URI(pdf_url)))
    end

    # Removes the temporary working directory including everything in it.
    def cleanup
      FileUtils.rm_r @tmp_dir
    end

  end

end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'brandeins/merger/extensions'
|
3
|
+
require 'brandeins/merger/templates/base'
|
4
|
+
require 'brandeins/merger/templates/osx'
|
5
|
+
require 'brandeins/merger/templates/pdftk_osx'
|
6
|
+
require 'brandeins/merger/templates/windows'
|
7
|
+
require 'brandeins/merger/templates/ghostscript_windows'
|
8
|
+
|
9
|
+
module BrandEins
  module Merger
    # Chooses the PDF merge tool template that matches the current platform.
    class PDFTools

      class << self
        # Returns an instance of the first merge tool template registered for
        # the platform, or nil when there is none (unsupported platform or no
        # tool template defined for it).
        #
        # env - optional hash; env[:os] overrides RUBY_PLATFORM. The hash is
        #       only read, never modified.
        def get_pdf_tool(env = nil)
          os = (env && env[:os]) || RUBY_PLATFORM

          template_group = choose_template_group(os)
          # No template group for this platform: report "no tool" instead of
          # failing with NoMethodError on nil.
          return nil if template_group.nil?

          return_pdf_tool(get_subclasses(template_group))
        end

        private

        # Maps a platform string to the template group class whose subclasses
        # are the merge tools for that platform. Returns nil for platforms
        # without a template group.
        def choose_template_group(os)
          # 'w32' covers "i386-mingw32"; mswin/mingw also cover platform
          # strings such as "i386-mswin32" and "x64-mingw-ucrt".
          return BrandEins::Merger::Templates::Windows if os =~ /w32|mswin|mingw/
          return BrandEins::Merger::Templates::OSX if os.include? 'darwin'
          nil
        end

        # Tool classes are the direct subclasses of the template group.
        def get_subclasses(klass)
          klass.subclasses.to_a
        end

        # Instantiates the first tool class, nil when the list is empty.
        def return_pdf_tool(tool_classes)
          first_tool = tool_classes.first
          first_tool && first_tool.new
        end

      end

    end
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module BrandEins
  module Merger
    module Templates

      # Base class of all PDF merge tool templates. A subclass describes one
      # command line tool through three methods:
      #
      #   cmd  - name of the executable
      #   args - array of arguments for a merge run; the element
      #          '__pdf_files__' is replaced by the input files and the text
      #          '__target_pdf__' by the output file
      #   noop - argument(s) for a harmless call used to probe for the tool
      class Base

        # True when the tool's executable could be started.
        def available?
          _cmd_available? cmd, noop
        end

        def cmd; raise "Must be implemtented by the subclasses"; end
        def noop; raise "Must be implemtented by the subclasses"; end
        def args; raise "Must be implemtented by the subclasses"; end

        # Merges +pdf_files+ (array of paths, in order) into +target_pdf+.
        #
        # The call blocks until the tool has finished, so callers may delete
        # the input files right afterwards. Returns true when the tool exited
        # successfully, false when it failed, could not be started or an
        # error was raised while preparing the call.
        def merge_pdf_files(pdf_files, target_pdf)
          argv = _build_argv(pdf_files, target_pdf)
          puts "executing: #{cmd} #{argv.join(' ')}"
          _exec(cmd, *argv) ? true : false
        rescue StandardError => e
          puts "error: #{e.inspect}"
          false
        end

        private

        # Expands the argument template into the final argument list. Every
        # path becomes its own argument, so no shell quoting is needed and
        # paths may contain spaces or quotes.
        def _build_argv(pdf_files, target_pdf)
          args.flat_map do |arg|
            if arg == '__pdf_files__'
              pdf_files.map(&:to_s)
            else
              # Block form of gsub: the replacement is taken literally, so
              # backslashes in Windows paths are not read as back-references.
              arg.gsub('__pdf_files__') { pdf_files.join(' ') }.
                  gsub('__target_pdf__') { target_pdf.to_s }
            end
          end
        end

        # Runs the command and waits for it to finish. With arguments the
        # program is started directly without a shell; a single command
        # string is handed to Kernel#system unchanged.
        # Returns true/false for the exit status, nil when it could not start.
        def _exec(cmd, *argv)
          system(cmd, *argv)
        end

        # Starts +cmd+ with the probe arguments (array or single string) and
        # discards its output. Returns false only when the command could not
        # be started at all; its exit status is not considered.
        def _cmd_available?(cmd, args)
          probe_args = Array(args).map(&:to_s)
          !system(cmd, *probe_args, :out => File::NULL, :err => File::NULL).nil?
        rescue StandardError
          false
        end
      end

    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module BrandEins
  module Merger
    module Templates

      # Merge tool template for the 64 bit Ghostscript console executable,
      # registered in the Windows template group by subclassing it.
      class GhostscriptWin < BrandEins::Merger::Templates::Windows
        # Name of the executable to call.
        def cmd; 'gswin64c.exe'; end

        # Argument template for a merge run. '__target_pdf__' and
        # '__pdf_files__' are placeholders for the output file and the list
        # of input files.
        def args
          %w[-dNOPAUSE -dBATCH -sDEVICE=pdfwrite -sOutputFile=__target_pdf__ __pdf_files__]
        end

        # Arguments of the call used to probe for the executable.
        def noop
          %w[--version]
        end
      end

    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module BrandEins
  module Merger
    module Templates

      # Merge tool template for the pdftk command line program, registered in
      # the OSX template group by subclassing it.
      class PdftkOSX < BrandEins::Merger::Templates::OSX
        # Name of the executable to call.
        def cmd; 'pdftk'; end

        # Argument template for a merge run. '__pdf_files__' and
        # '__target_pdf__' are placeholders for the list of input files and
        # the output file.
        def args
          %w[__pdf_files__ output __target_pdf__]
        end

        # Arguments of the call used to probe for the executable.
        def noop
          %w[--version]
        end
      end

    end
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'net/http'
|
4
|
+
|
5
|
+
module BrandEins
  module Parser
    # Reads the archive page ("<base_url>/archiv.html"), which lists the
    # magazines of every year together with their covers.
    class ArchiveSite

      # base_url - root URL of the site, without trailing slash.
      # opts     - optional hash; opts[:html] supplies the archive page's HTML
      #            directly so that nothing is fetched over the network.
      def initialize(base_url, opts = {})
        @base_url    = base_url
        @archive_url = @base_url + "/archiv.html"
        html = opts[:html]
        @doc = Nokogiri::HTML(html) if html
      end

      # Parsed archive page; fetched on first use and then cached.
      def doc
        @doc ||= Nokogiri::HTML(Net::HTTP.get(URI(@archive_url)))
      end

      # Returns the absolute URLs of the magazine pages listed for +year+, in
      # page order. List items carrying an id attribute are ignored, as are
      # items without a link or whose link has no href.
      def get_magazine_links_by_year(year = 2000)
        puts "Loading Magazine from year #{year}" if $BE_VERBOSE
        doc.css(".jahrgang-#{year} ul li").each_with_object([]) do |node, links|
          next unless node['id'].nil?

          anchor = node.css('a').first
          href   = anchor && anchor['href']
          # An anchor without href cannot be turned into a URL; skip it
          # instead of failing on String + nil.
          next if href.nil?

          links << @base_url + '/' + href
        end
      end

      # Returns { :title => ..., :img_url => ... } for the cover of the given
      # volume. When several cover images match, the last one wins; when none
      # matches (or it has no src) :img_url is just "<base_url>/".
      def get_magazine_cover(year, volume)
        detail_selector = "#month_detail_#{year}_#{volume}"
        title = doc.css("#{detail_selector} .titel").children[0].to_s

        cover_img = doc.css("#{detail_selector} .cover a img").to_a.last
        img_url   = (cover_img && cover_img['src']).to_s

        { :title => title, :img_url => @base_url + '/' + img_url }
      end

      # Returns the PDF links of all articles of the magazine page at +url+.
      def magazine_pdf_links(url)
        magazine = BrandEins::Parser::MagazineSite.new(url, @base_url)
        magazine.get_magazine_pdf_links
      end

    end

  end

end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'net/http'
|
4
|
+
|
5
|
+
module BrandEins
  module Parser
    # A single article page, used to look up the article's PDF download link.
    class ArticleSite

      # url - absolute URL of the article page.
      def initialize(url)
        @url = url
      end

      # Parsed article page; fetched on first use and then cached.
      def doc
        @doc ||= Nokogiri::HTML(Net::HTTP.get(URI(@url)))
      end

      # Returns the href of the first download link in the sidebar, or nil
      # when the page has no such link.
      def get_pdf_link
        puts "Parsing Article: #{@url}" if $BE_VERBOSE
        download_anchor = doc.css("div#sidebar ul li#downloaden a")[0]
        download_anchor && download_anchor['href']
      end

    end

  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'net/http'
|
4
|
+
|
5
|
+
module BrandEins
  module Parser
    # A magazine (volume) page. Collects the PDF download links of the
    # articles it links to.
    class MagazineSite

      # url      - absolute URL of the magazine page.
      # base_url - root URL of the site, without trailing slash; used to
      #            turn the hrefs found on the pages into absolute URLs.
      def initialize(url, base_url)
        @url, @base_url = url, base_url
      end

      # Parsed magazine page; fetched on first use and then cached.
      def doc
        @doc ||= Nokogiri::HTML(Net::HTTP.get(URI(@url)))
      end

      # Returns the absolute PDF URLs of all articles of the magazine: the
      # editorial articles first, followed by the "Schwerpunkt" articles.
      def get_magazine_pdf_links
        puts "Parsing #{@url}" if $BE_VERBOSE
        get_editorial_article_links + get_schwerpunkt_article_links
      end

      def get_schwerpunkt_article_links
        get_links("div.articleList ul h4 a")
      end

      def get_editorial_article_links
        get_links(".editorial-links li a")
      end

      # Follows every link matched by +css_selector+ to its article page and
      # returns the absolute URLs of the PDFs offered there. Links without
      # href and articles without a PDF are left out.
      def get_links(css_selector)
        doc.css(css_selector).each_with_object([]) do |node, links|
          href = node['href']
          next if href.nil?

          article_link = @base_url + '/' + href
          pdf_link = BrandEins::Parser::ArticleSite.new(article_link).get_pdf_link

          # An article without PDF must be skipped whether or not verbose
          # output is enabled; only the message depends on $BE_VERBOSE.
          if pdf_link.nil?
            if $BE_VERBOSE
              puts "-------------------------------"
              puts "No Content for: #{article_link}"
              puts "-------------------------------"
            end
            next
          end

          links << @base_url + '/' + pdf_link
        end
      end
    end

  end

end
|
data/lib/brandeins/setup.rb
CHANGED
@@ -1,13 +1,11 @@
|
|
1
|
-
|
1
|
+
# encoding: utf-8
|
2
2
|
|
3
3
|
module BrandEins
|
4
4
|
class Setup
|
5
|
-
attr_reader :pdf_tool
|
6
5
|
|
7
|
-
def initialize(env =
|
8
|
-
env = Hash.new if env.nil?
|
6
|
+
# env - optional hash of overrides; nil is treated like an empty hash.
#       env[:os] replaces RUBY_PLATFORM and env[:pdf_tool] replaces the
#       merge tool looked up through BrandEins::Merger::PDFTools.
def initialize(env = {})
  env ||= {}
  @os = env[:os] || RUBY_PLATFORM
  # The lookup class is BrandEins::Merger::PDFTools; a BrandEins::Merging
  # namespace does not exist and referencing it raises NameError.
  @pdf_tool = env[:pdf_tool] || BrandEins::Merger::PDFTools.get_pdf_tool(env)
end
|
12
10
|
|
13
11
|
def run
|
@@ -37,4 +35,4 @@ module BrandEins
|
|
37
35
|
end
|
38
36
|
|
39
37
|
end
|
40
|
-
end
|
38
|
+
end
|