brandeins 0.1.6 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,112 @@
1
+ # encoding: utf-8
2
+ require 'brandeins/parser/article_site'
3
+ require 'brandeins/parser/magazine_site'
4
+ require 'brandeins/parser/archive_site'
5
+ require 'brandeins/merger/pdf_tools'
6
+ require 'net/http'
7
+ require 'prawn'
8
+
9
require 'fileutils' # FileUtils is used below for the temporary directory handling

module BrandEins
  # Downloads the article PDFs of brand eins issues and hands them, together
  # with a generated cover page, to a PDF merge tool that builds one PDF per
  # issue.
  class Downloader

    # @param path [String] directory that receives the merged PDFs; temporary
    #   files are stored in "<path>/brand-eins-tmp"
    # @param opts [Hash] :verbose => truthy turns on progress output by
    #   setting the global $BE_VERBOSE
    def initialize(path, opts = {})
      $BE_VERBOSE = !!opts[:verbose]

      @url     = 'http://www.brandeins.de'
      @dl_dir  = path
      @tmp_dir = @dl_dir + '/brand-eins-tmp'
      @pdftool = BrandEins::Merger::PDFTools.get_pdf_tool
      @archive = BrandEins::Parser::ArchiveSite.new(@url)
      create_tmp_dirs
    end

    # Fetches every issue the archive lists for +year+. Volumes are numbered
    # by their position in the archive's link list, starting at 1.
    def get_magazines_of_year(year = 2000)
      puts "Getting all brand eins magazines of #{year}. This may take a while..." if $BE_VERBOSE
      @archive.get_magazine_links_by_year(year).each_with_index do |magazine_link, index|
        volume = index + 1
        puts "Parsing Volume #{volume} of #{year}" if $BE_VERBOSE
        get_magazine_by_link(magazine_link, pdf_filename(year, volume), year, volume)
      end
    end

    # Fetches a single issue.
    #
    # @param year [Integer]
    # @param volume [Integer] 1-based position of the issue within the year
    # @raise [ArgumentError] when the archive lists no such volume
    def get_magazine(year = 2000, volume = 1)
      puts "Parsing Volume #{volume} of #{year}" if $BE_VERBOSE
      magazine_links = @archive.get_magazine_links_by_year(year)
      # A volume below 1 would index from the end of the list (-1 is the
      # last element), so it is rejected instead of silently picking another issue.
      magazine_link = volume >= 1 ? magazine_links[volume - 1] : nil
      if magazine_link.nil?
        raise ArgumentError, "No magazine found for volume #{volume} of #{year}"
      end
      get_magazine_by_link(magazine_link, pdf_filename(year, volume), year, volume)
    end

    private

    # Creates the temporary download directory (no-op when it already exists).
    def create_tmp_dirs
      FileUtils.mkdir_p @tmp_dir
    end

    # Downloads the cover image of the issue and renders it, below the issue
    # title, into a one-page PDF inside the temporary directory.
    #
    # @return [String] path of the generated cover PDF
    def create_cover_pdf(year, volume)
      puts "Creating cover for Volume #{volume} of #{year}" if $BE_VERBOSE
      cover = @archive.get_magazine_cover(year, volume)
      cover_title = cover[:title]
      cover_img_url = cover[:img_url]
      cover_img_file = @tmp_dir + "/cover-#{year}-#{volume}.jpg"
      cover_pdf_file = @tmp_dir + "/cover-#{year}-#{volume}.pdf"

      puts "Downloading cover image from #{cover_img_url} to #{cover_img_file}" if $BE_VERBOSE
      IO.binwrite(cover_img_file, Net::HTTP.get(URI(cover_img_url)))

      puts "Creating cover pdf #{cover_pdf_file} from #{cover_img_file}" if $BE_VERBOSE
      Prawn::Document.generate(cover_pdf_file) do |pdf|
        pdf.text "<font size='18'><b>" + cover_title + "</b></font>", :align => :center, :inline_format => true
        pdf.image cover_img_file, :position => :center, :vposition => :center
      end

      cover_pdf_file
    end

    # Downloads all article PDFs of one issue, prepends the cover page and
    # merges everything into "<download dir>/<target_pdf>".
    def get_magazine_by_link(target_magazine_link, target_pdf, year, volume)
      # cleanup removes the temporary directory after every merged issue, so
      # it has to be recreated before the next issue is downloaded.
      create_tmp_dirs

      pdf_links = @archive.magazine_pdf_links(target_magazine_link)
      pdf_files = download(pdf_links)
      pdf_files.unshift(create_cover_pdf(year, volume))

      if @pdftool.nil?
        puts 'brandeins wont merge the single pdf files since it didnt find an appropriate pdf tool' if $BE_VERBOSE
        return
      end

      target_pdf_path = "#{@dl_dir}/#{target_pdf}"
      if @pdftool.merge_pdf_files(pdf_files, target_pdf_path)
        cleanup
      elsif $BE_VERBOSE
        # Keep the single PDFs so a failed merge does not throw the downloads away.
        puts "Merging #{target_pdf_path} failed, keeping the downloaded files in #{@tmp_dir}"
      end
    end

    # File name (without directory) of the merged PDF of an issue.
    def pdf_filename(year, volume)
      "Brand-Eins-#{year}-#{volume}.pdf"
    end

    # Downloads every link into the temporary directory.
    #
    # @return [Array<String>] local file paths, in the order of +pdf_links+
    def download(pdf_links)
      pdf_links.map do |pdf_link|
        local_file = @tmp_dir + '/' + File.basename(pdf_link)
        download_pdf(pdf_link, local_file)
        local_file
      end
    end

    # Downloads one PDF unless a file with that name already exists.
    def download_pdf(pdf_url, pdf_filename)
      # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
      if File.exist? pdf_filename
        puts "File #{pdf_filename} seems to be already downloaded" if $BE_VERBOSE
        return true
      end

      puts "Downloading PDF from #{pdf_url} to #{pdf_filename}" if $BE_VERBOSE
      IO.binwrite(pdf_filename, Net::HTTP.get(URI(pdf_url)))
    end

    # Removes the temporary directory including all downloaded files.
    def cleanup
      FileUtils.rm_r @tmp_dir
    end

  end

end
@@ -0,0 +1,9 @@
1
+ # encoding: utf-8
2
+
3
# Core extension used by the merger to discover tool templates.
class Class

  # Lists every class currently known to ObjectSpace that inherits, directly
  # or indirectly, from the receiver. The receiver itself is not included.
  #
  # @return [Array<Class>]
  def subclasses
    ObjectSpace.each_object(Class).each_with_object([]) do |candidate, descendants|
      descendants << candidate if candidate < self
    end
  end

end
@@ -0,0 +1,47 @@
1
+ # encoding: utf-8
2
+ require 'brandeins/merger/extensions'
3
+ require 'brandeins/merger/templates/base'
4
+ require 'brandeins/merger/templates/osx'
5
+ require 'brandeins/merger/templates/pdftk_osx'
6
+ require 'brandeins/merger/templates/windows'
7
+ require 'brandeins/merger/templates/ghostscript_windows'
8
+
9
module BrandEins
  module Merger
    # Selects the PDF merge tool template for the current platform.
    class PDFTools

      class << self
        # Returns an instance of the first tool template registered for the
        # platform, or nil when the platform has no template group or the
        # group has no templates.
        #
        # @param env [Hash, nil] optional settings; :os overrides RUBY_PLATFORM
        # @return [Object, nil]
        #
        # NOTE(review): the templates' `available?` check is not consulted
        # here, so the returned tool may not be installed - confirm whether
        # that is intended.
        def get_pdf_tool(env = nil)
          # Work on a copy so the caller's hash is not modified.
          @env = (env || {}).dup
          @env[:os] ||= RUBY_PLATFORM

          init_pdf_tools
          return_pdf_tool
        end

        private

        # Instantiates all templates of the platform's group (none when the
        # platform is not supported).
        def init_pdf_tools
          template_group = choose_template_group
          @pdf_tools = template_group.nil? ? [] : get_subclasses(template_group)
        end

        # Maps the platform string to a template base class.
        #
        # @return [Class, nil] nil for platforms without a template group
        def choose_template_group
          os = @env[:os].to_s
          # "w32" covers mingw32 builds, "mswin" the MSVC builds of Ruby.
          return BrandEins::Merger::Templates::Windows if os =~ /w32|mswin/
          return BrandEins::Merger::Templates::OSX if os.include? 'darwin'
          nil
        end

        # @return [Array<Object>] one instance per subclass of +klass+
        def get_subclasses(klass)
          klass.subclasses.map { |sklass| sklass.new }
        end

        def return_pdf_tool
          @pdf_tools.first
        end

      end

    end
  end
end
@@ -0,0 +1,47 @@
1
+ # encoding: utf-8
2
+
3
module BrandEins
  module Merger
    module Templates

      # Base class of the PDF merge tool templates. A subclass describes one
      # command line tool by overriding #cmd, #noop and #args.
      class Base

        # @return [Boolean] true when the tool's command could be started
        #   with the #noop arguments
        def available?
          _cmd_available? cmd, noop
        end

        # Name or path of the executable.
        def cmd; raise "Must be implemtented by the subclasses"; end
        # Arguments for a harmless invocation, used by #available?.
        def noop; raise "Must be implemtented by the subclasses"; end
        # Argument templates for merging; may contain the placeholders
        # __pdf_files__ and __target_pdf__.
        def args; raise "Must be implemtented by the subclasses"; end

        # Runs the tool to merge +pdf_files+ into +target_pdf+ and waits for
        # it to finish, so callers can safely remove the input files afterwards.
        #
        # @param pdf_files [Array<String>] input files in merge order
        # @param target_pdf [String] path of the merged PDF
        # @return [Boolean] true when the tool ran and exited successfully,
        #   false otherwise (including any StandardError raised on the way)
        def merge_pdf_files(pdf_files, target_pdf)
          argv = _build_arguments(pdf_files, target_pdf)
          puts "executing: #{cmd} #{argv.join(' ')}"
          _exec(cmd, *argv)
        rescue StandardError => e
          # StandardError only: signals and exit requests must not be swallowed.
          puts "error: #{e.inspect}"
          false
        end

        private

        # Expands the argument templates into the final argument vector.
        # An argument that is exactly __pdf_files__ becomes one argument per
        # file; placeholders embedded in a longer argument are substituted in place.
        def _build_arguments(pdf_files, target_pdf)
          files = pdf_files.map { |pdf_file| pdf_file.to_s }
          Array(args).flat_map do |template|
            template = template.to_s
            if template == '__pdf_files__'
              files
            else
              # Block form of gsub: backslashes in paths (e.g. on Windows)
              # must not be interpreted as back-references.
              [template.gsub('__pdf_files__') { files.join(' ') }
                       .gsub('__target_pdf__') { target_pdf.to_s }]
            end
          end
        end

        # Runs the command with the arguments passed as a vector (no shell
        # involved when arguments are given, so no quoting is needed) and
        # blocks until it has finished.
        #
        # @return [Boolean] true only when the command exited with status 0
        def _exec(cmd, *argv)
          system(cmd, *argv) == true
        end

        # @param cmd [String] executable to probe
        # @param args [Array<String>, String, nil] arguments for the probe
        # @return [Boolean] false when the command could not be started
        def _cmd_available?(cmd, args)
          argv = Array(args).map { |arg| arg.to_s }
          # system returns nil when the command cannot be executed at all;
          # a non-zero exit status still proves the tool exists.
          !system(cmd, *argv, :out => File::NULL, :err => File::NULL).nil?
        rescue StandardError
          false
        end
      end

    end
  end
end
@@ -0,0 +1,25 @@
1
+ # encoding: utf-8
2
+
3
module BrandEins
  module Merger
    module Templates

      # Windows template describing a merge via the `gswin64c.exe`
      # executable with the `pdfwrite` device.
      class GhostscriptWin < BrandEins::Merger::Templates::Windows

        # Executable to run.
        def cmd; 'gswin64c.exe'; end

        # Merge arguments; __target_pdf__ and __pdf_files__ are placeholders
        # (presumably substituted by the base class before execution).
        def args
          %w[-dNOPAUSE -dBATCH -sDEVICE=pdfwrite -sOutputFile=__target_pdf__ __pdf_files__]
        end

        # Arguments of the invocation used to probe for the tool.
        def noop
          %w[--version]
        end

      end

    end
  end
end
@@ -0,0 +1,11 @@
1
+ # encoding: utf-8
2
+
3
module BrandEins
  module Merger
    module Templates

      # Common parent of the tool templates for OS X; adds nothing of its
      # own to the base template.
      class OSX < BrandEins::Merger::Templates::Base
      end

    end
  end
end
@@ -0,0 +1,25 @@
1
+ # encoding: utf-8
2
+
3
module BrandEins
  module Merger
    module Templates

      # OS X template describing a merge via the `pdftk` command.
      class PdftkOSX < BrandEins::Merger::Templates::OSX

        # Executable to run.
        def cmd; 'pdftk'; end

        # Merge arguments; __pdf_files__ and __target_pdf__ are placeholders
        # (presumably substituted by the base class before execution).
        def args
          %w[__pdf_files__ output __target_pdf__]
        end

        # Arguments of the invocation used to probe for the tool.
        def noop
          %w[--version]
        end

      end

    end
  end
end
@@ -0,0 +1,11 @@
1
+ # encoding: utf-8
2
+
3
module BrandEins
  module Merger
    module Templates

      # Common parent of the tool templates for Windows; adds nothing of its
      # own to the base template.
      class Windows < BrandEins::Merger::Templates::Base
      end

    end
  end
end
@@ -0,0 +1,54 @@
1
+ # encoding: utf-8
2
+ require 'nokogiri'
3
+ require 'net/http'
4
+
5
module BrandEins
  module Parser
    # Reads issue links and cover information from the archive page
    # ("<base_url>/archiv.html").
    class ArchiveSite

      # @param base_url [String] site root, without trailing slash
      # @param opts [Hash] :html => markup to parse instead of fetching the
      #   archive page over HTTP
      def initialize(base_url, opts = {})
        @base_url = base_url
        @archive_url = @base_url + "/archiv.html"
        html = opts[:html]
        @doc = Nokogiri::HTML(html) if html
      end

      # Parsed archive page; fetched on first use unless markup was supplied
      # to the constructor.
      def doc
        @doc ||= Nokogiri::HTML(Net::HTTP.get(URI(@archive_url)))
      end

      # Collects the issue links of a year: the first anchor of every list
      # item below ".jahrgang-<year>" that has no id attribute.
      #
      # @return [Array<String>] links prefixed with "<base_url>/"
      def get_magazine_links_by_year(year = 2000)
        puts "Loading Magazine from year #{year}" if $BE_VERBOSE
        links = []
        doc.css(".jahrgang-#{year} ul li").each do |item|
          next unless item['id'].nil?
          anchor = item.css('a')[0]
          next if anchor.nil?
          links << @base_url + '/' + anchor['href']
        end
        links
      end

      # Looks up title and cover image of an issue in its
      # "#month_detail_<year>_<volume>" element.
      #
      # @return [Hash] :title => String, :img_url => "<base_url>/<src>" where
      #   src comes from the last matching image (empty when none matches)
      def get_magazine_cover(year, volume)
        detail = "#month_detail_#{year}_#{volume}"
        title = doc.css("#{detail} .titel").children[0].to_s
        img_url = doc.css("#{detail} .cover a img").reduce('') { |_previous, img| img['src'] }
        { :title => title, :img_url => @base_url + '/' + img_url }
      end

      # Delegates to MagazineSite to collect the PDF links of the issue page
      # at +url+.
      def magazine_pdf_links(url)
        BrandEins::Parser::MagazineSite.new(url, @base_url).get_magazine_pdf_links
      end

    end

  end

end
@@ -0,0 +1,26 @@
1
+ # encoding: utf-8
2
+ require 'nokogiri'
3
+ require 'net/http'
4
+
5
module BrandEins
  module Parser
    # Extracts the PDF download link from a single article page.
    class ArticleSite

      # @param url [String] address of the article page
      def initialize(url)
        @url = url
      end

      # Parsed article page, fetched over HTTP on first use.
      def doc
        @doc ||= Nokogiri::HTML(Net::HTTP.get(URI(@url)))
      end

      # @return [String, nil] href of the first anchor inside the sidebar's
      #   "downloaden" list item, nil when the page has no such anchor
      def get_pdf_link
        puts "Parsing Article: #{@url}" if $BE_VERBOSE
        anchor = doc.css("div#sidebar ul li#downloaden a")[0]
        anchor && anchor['href']
      end

    end

  end
end
@@ -0,0 +1,49 @@
1
+ # encoding: utf-8
2
+ require 'nokogiri'
3
+ require 'net/http'
4
+
5
module BrandEins
  module Parser
    # Collects the PDF links of all articles linked from one issue page.
    class MagazineSite

      # @param url [String] address of the issue page
      # @param base_url [String] site root used to build absolute links
      def initialize(url, base_url)
        @url, @base_url = url, base_url
      end

      # Parsed issue page, fetched over HTTP on first use.
      def doc
        @doc ||= Nokogiri::HTML(Net::HTTP.get(URI(@url)))
      end

      # @return [Array<String>] PDF links of the editorial articles followed
      #   by those of the "Schwerpunkt" article list
      def get_magazine_pdf_links
        puts "Parsing #{@url}" if $BE_VERBOSE
        get_editorial_article_links + get_schwerpunkt_article_links
      end

      def get_schwerpunkt_article_links
        get_links("div.articleList ul h4 a")
      end

      def get_editorial_article_links
        get_links(".editorial-links li a")
      end

      # Follows every anchor matched by +css_selector+ to its article page
      # and collects the article's PDF link. Anchors without href and
      # articles without a PDF link are skipped.
      #
      # @return [Array<String>] PDF links prefixed with "<base_url>/"
      def get_links(css_selector)
        doc.css(css_selector).each_with_object([]) do |node, links|
          href = node['href']
          next if href.nil?

          article_link = @base_url + '/' + href
          pdf_link = BrandEins::Parser::ArticleSite.new(article_link).get_pdf_link

          if pdf_link.nil?
            # The skip must not depend on verbosity: only the report does.
            if $BE_VERBOSE
              puts "-------------------------------"
              puts "No Content for: #{article_link}"
              puts "-------------------------------"
            end
            next
          end

          links << @base_url + '/' + pdf_link
        end
      end
    end

  end

end
@@ -1,13 +1,11 @@
1
- require File.expand_path '../pdf-tools', __FILE__
1
+ # encoding: utf-8
2
2
 
3
3
  module BrandEins
4
4
  class Setup
5
- attr_reader :pdf_tool
6
5
 
7
- def initialize(env = nil)
8
- env = Hash.new if env.nil?
6
+ def initialize(env = {})
9
7
  @os = env[:os] || RUBY_PLATFORM
10
- @pdf_tool = env[:pdf_tool] || BrandEins::PdfTools.get_pdf_tool(env)
8
+ @pdf_tool = env[:pdf_tool] || BrandEins::Merging::PdfTools.get_pdf_tool(env)
11
9
  end
12
10
 
13
11
  def run
@@ -37,4 +35,4 @@ module BrandEins
37
35
  end
38
36
 
39
37
  end
40
- end
38
+ end