log2layout 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ # A sample Gemfile
2
+ source "https://rubygems.org"
3
+
4
+ # gem "rails"
5
+ gem 'mechanize'
6
+ gem 'rspec'
@@ -0,0 +1,38 @@
1
+ # Log2Layout
2
+
3
+ ## Features:
4
+
5
+ + Command-line image scraper based on an Excel spreadsheet template (aka image log)
6
+ + Pulls URLs and figure numbers from an Excel spreadsheet, creates an "images" folder next to the image log, and downloads each image file into it
7
+ + Automatically renames image files to figure numbers
8
+ + Keeps original image extension (i.e., no mass .jpg renaming)
9
+ + Currently only works with Wikimedia images
10
+ + Known bugs: issues with https access, needs handling for Wiki images without Original File link
11
+
12
+
13
+ ## Log2Layout Recipe
14
+
15
+ 1. Install the Log2Layout gem.
16
+
17
+     gem install log2layout
18
+
19
+ 2. Run the gem with the name of your Excel spreadsheet file as the argument.
20
+
21
+     log2layout c:/Users/You/Desktop/image_log.xlsx
22
+
23
+ 3. Magic happens (sometimes the magic can take a little while--these are high res images, after all), and suddenly an "images" folder appears beside your spreadsheet. Ta-da!
24
+
25
+
26
+ ## Planned Features for When I'm a Better Programmer:
27
+
28
+ + GUI interface (wxRuby)
29
+ + General page scanning: will scan page for a unique <img> tag in the page's HTML and download the available image resources
30
+ + Duplicate handling--if it pulls 2+ images, it will: 1) prefix all images with a DUP_ label, 2) highlight the Excel row in red, 3) create and automatically open a text file log (saved to the /images folder) that provides the row number of the Excel spreadsheet that got multiple images, and a list of the image file names to be compared
31
+ + Inserts a thumbnail of the image into the spreadsheet (creates column)
32
+ + Generates caption text boxes in InDesign with Figure Title paragraph style applied, saved to a /resources folder
33
+
34
+
35
+
36
+
37
+
38
+
@@ -0,0 +1,60 @@
1
+ # Opens Excel Log
2
+
3
+ require 'win32ole'
4
+
5
+ class ExcelProcessor
6
+
7
+ attr_accessor :spreadsheet, :image_log
8
+
9
+ def initialize(spreadsheet, image_log={})
10
+ @spreadsheet = spreadsheet
11
+ @image_log = image_log
12
+ end
13
+
14
+ def read_spreadsheet
15
+ excel = WIN32OLE.new('Excel.Application')
16
+ excel.visible = false
17
+ img_log = excel.Workbooks.Open(@spreadsheet)
18
+ img_ws = img_log.Worksheets(1)
19
+
20
+ # Set-up arrays for column values
21
+ link_array = []
22
+ figure_array = []
23
+
24
+ # Pull values from Figure # column
25
+ for row in 10..img_ws.UsedRange.Rows.Count do
26
+ cell = img_ws.Cells(row,3).value.to_s
27
+ if cell != nil
28
+ c = cell.sub(/(\D*)/, "fig_")
29
+ new_cell = c.sub(/(\.)/, "-")
30
+ else
31
+ new_cell = ""
32
+ end
33
+
34
+ figure_array << new_cell
35
+ end
36
+
37
+ # Pull values from Source URL column
38
+ for row in 10..img_ws.UsedRange.Rows.Count do
39
+ cell = img_ws.Cells(row,4).Value
40
+ if (cell != nil) && (cell.include?("."))
41
+ new_cell = cell
42
+ else
43
+ new_cell = ""
44
+ end
45
+
46
+ link_array << new_cell
47
+ end
48
+
49
+ # Zip arrays into hash of figure numbers and links
50
+ @image_log = Hash[figure_array.zip(link_array)]
51
+
52
+ # Remove blanks
53
+ @image_log.delete_if { |k, v| k == "" }
54
+
55
+ # Shut it down
56
+ excel.ActiveWorkbook.Close(0)
57
+ excel.Quit
58
+ end
59
+
60
+ end
@@ -0,0 +1,43 @@
1
+ require "#{File.dirname(__FILE__)}/./excel_processor"
2
+ require 'mechanize'
3
+ require 'open-uri'
4
+ require 'logger'
5
+
6
+ class ImageScraper
7
+
8
+ attr_accessor :log, :location
9
+
10
+ def initialize log, location
11
+ @log = log
12
+ @location = location
13
+ end
14
+
15
+ def create_dir
16
+ if Dir.exists?("#{File.dirname(@location)}/images")
17
+ puts "Exists!"
18
+ else
19
+ Dir.mkdir("#{File.dirname(@location)}/images")
20
+ end
21
+ end
22
+
23
+ def scrape
24
+ site_seeker = Mechanize.new { |a| a.log = Logger.new("#{File.dirname(@location)}/images/scrape_summary.log") }
25
+ #For Wiki only
26
+ @log.each do |fig, link|
27
+ ext = link.scan(/\..{3,4}$/)
28
+ begin
29
+ page = site_seeker.get(link)
30
+ puts "Found #{fig} at #{link}"
31
+ img_page = page.links_with(:text => "Original file").first.click
32
+ puts "Located original image"
33
+ img = img_page.save!("#{File.dirname(@location)}/images/#{fig}#{ext[0]}")
34
+ puts "#{fig} has been saved."
35
+ rescue
36
+ puts "Uh-oh, something went wrong with #{fig.upcase}"
37
+ next
38
+ end
39
+ end
40
+ end
41
+
42
+ end
43
+
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative './image_scraper.rb'
4
+ require_relative './excel_processor.rb'
5
+
6
+ #TO RUN MANUALLY ON THE COMMAND LINE
7
+
8
+ # excel_path = ARGV[0]
9
+
10
+ # @book = ExcelProcessor.new(excel_path)
11
+ # @book.read_spreadsheet
12
+ # @book.image_log
13
+
14
+ # @image_scraper = ImageScraper.new(@book.image_log, @book.spreadsheet)
15
+ # @image_scraper.create_dir
16
+ # @image_scraper.scrape
@@ -0,0 +1,56 @@
1
+ require 'spec_helper.rb'
2
+ require "#{File.dirname(__FILE__)}/../lib/excel_processor"
3
+
4
+ describe ExcelProcessor do
5
+
6
+ before do
7
+ @book = ExcelProcessor.new("#{File.dirname(__FILE__)}/../resources/image-log-template.xlsx")
8
+ end
9
+
10
+ context "#new" do
11
+ it "should have the spreadsheet point to a file that exists" do
12
+ File.exist?(@book.spreadsheet).should be_true
13
+ end
14
+
15
+ it "should create an empty hash for image log" do
16
+ @book.image_log.should be_a_kind_of(Hash)
17
+ @book.image_log.length.should eq 0
18
+ end
19
+ end
20
+
21
+ context "#read_spreadsheet" do
22
+ it "should open Excel and populate image log" do
23
+ @book.read_spreadsheet
24
+ @book.image_log.should_not be_empty
25
+ end
26
+
27
+ it "should start with 'fig_'" do
28
+ @book.read_spreadsheet
29
+ @book.image_log.has_key?("fig_8-8").should be_true
30
+ @book.image_log.has_key?("fig_2").should be_true
31
+ end
32
+
33
+ it "should replace periods between numbers with dashes" do
34
+ @book.read_spreadsheet
35
+ @book.image_log.has_key?("fig_1-1").should be_true
36
+ end
37
+
38
+ it "should match the right figure numbers with the right links" do
39
+ @book.read_spreadsheet
40
+ @book.image_log.fetch("fig_1-1").should eq "http://commons.wikimedia.org/wiki/File:Basketry-covered_lightbulb_01.jpg"
41
+ end
42
+
43
+ it "should remove any pairs with blank values" do
44
+ @book.read_spreadsheet
45
+ @book.image_log.has_key?("").should be_false
46
+ end
47
+
48
+ it "should return an @image_log length of 4" do
49
+ @book.read_spreadsheet
50
+ @book.image_log.length.should eq 4
51
+ end
52
+
53
+ end
54
+
55
+
56
+ end
@@ -0,0 +1,49 @@
1
+ require 'spec_helper.rb'
2
+ require "#{File.dirname(__FILE__)}/../lib/image_scraper"
3
+ require "#{File.dirname(__FILE__)}/../lib/excel_processor"
4
+ require "mechanize"
5
+
6
+ describe ImageScraper do
7
+
8
+ before :all do
9
+ @book = ExcelProcessor.new("#{File.dirname(__FILE__)}/../resources/image-log-template.xlsx")
10
+ @book.read_spreadsheet
11
+ @image_scraper = ImageScraper.new(@book.image_log, @book.spreadsheet)
12
+ end
13
+
14
+ context "#new" do
15
+ it "should do receive @image_log from the Excel Processor object" do
16
+ @image_scraper.log.should be_a_kind_of(Hash)
17
+ @image_scraper.log.should_not be_empty
18
+ end
19
+
20
+ it "should receive the location of the spreadsheet" do
21
+ @image_scraper.location.should == "#{File.dirname(__FILE__)}/../resources/image-log-template.xlsx"
22
+ end
23
+ end
24
+
25
+ context "#create_dir" do
26
+ it "should create a folder called 'images'" do
27
+ @image_scraper.create_dir
28
+ img_dir = File.dirname(@image_scraper.location)
29
+ File.exist?(img_dir).should be_true
30
+ end
31
+ end
32
+
33
+ context "#scrape" do
34
+ it "should handle all errors" do
35
+ @final_log = @image_scraper.scrape
36
+ expect { @final_log }.not_to raise_error
37
+ end
38
+
39
+ it "should save the image file" do
40
+ @final_log
41
+ File.exist?("#{File.dirname(@image_scraper.location)}/images/fig_1-1.jpg").should be_true
42
+ end
43
+
44
+ it "should create a scraping summary log file" do
45
+ File.exist?("#{File.dirname(@image_scraper.location)}/images/scrape_summary.log").should be_true
46
+ end
47
+ end
48
+
49
+ end
@@ -0,0 +1,3 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+
3
+ require 'log2layout.rb'
metadata ADDED
@@ -0,0 +1,53 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: log2layout
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Sarah W.
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-02-15 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Pulls Wiki URLs from Excel spreadsheet and automates image downloading.
15
+ email: sarahcwheeler@smail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - README.md
21
+ - Gemfile
22
+ - spec/excel_processor_spec.rb
23
+ - spec/image_scraper_spec.rb
24
+ - spec/spec_helper.rb
25
+ - lib/excel_processor.rb
26
+ - lib/image_scraper.rb
27
+ - lib/log2layout.rb
28
+ homepage: http://rubygems.org/gems/log2layout
29
+ licenses:
30
+ - MIT
31
+ post_install_message:
32
+ rdoc_options: []
33
+ require_paths:
34
+ - lib
35
+ required_ruby_version: !ruby/object:Gem::Requirement
36
+ none: false
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ required_rubygems_version: !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - ! '>='
45
+ - !ruby/object:Gem::Version
46
+ version: '0'
47
+ requirements: []
48
+ rubyforge_project:
49
+ rubygems_version: 1.8.28
50
+ signing_key:
51
+ specification_version: 3
52
+ summary: Scraper for Wikimedia images.
53
+ test_files: []