log2layout 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +6 -0
- data/README.md +38 -0
- data/lib/excel_processor.rb +60 -0
- data/lib/image_scraper.rb +43 -0
- data/lib/log2layout.rb +16 -0
- data/spec/excel_processor_spec.rb +56 -0
- data/spec/image_scraper_spec.rb +49 -0
- data/spec/spec_helper.rb +3 -0
- metadata +53 -0
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# Log2Layout
|
2
|
+
|
3
|
+
## Features:
|
4
|
+
|
5
|
+
+ Command-line image scraper based on an Excel spreadsheet template (aka image log)
|
6
|
+
+ Pulls URLs and figure numbers from an Excel spreadsheet, creates an "images" folder beside the image log, and downloads each image file into it
|
7
|
+
+ Automatically renames image files to figure numbers
|
8
|
+
+ Keeps original image extension (i.e., no mass .jpg renaming)
|
9
|
+
+ Currently only works with Wikimedia images
|
10
|
+
+ Known bugs: pages served over HTTPS can fail to load, and Wikimedia pages that have no "Original file" link are not handled yet
|
11
|
+
|
12
|
+
|
13
|
+
## Log2Layout Recipe
|
14
|
+
|
15
|
+
1. Install the Log2Layout gem.
|
16
|
+
|
17
|
+
gem install log2layout
|
18
|
+
|
19
|
+
2. Run the gem with the name of your Excel spreadsheet file as the argument.
|
20
|
+
|
21
|
+
log2layout c:/Users/You/Desktop/image_log.xlsx
|
22
|
+
|
23
|
+
3. Magic happens (sometimes the magic can take a little while--these are high res images, after all), and suddenly an "images" folder appears beside your spreadsheet. Ta-da!
|
24
|
+
|
25
|
+
|
26
|
+
## Planned Features for When I'm a Better Programmer:
|
27
|
+
|
28
|
+
+ GUI interface (wxRuby)
|
29
|
+
+ General page scanning: will scan page for a unique <img> tag in the page's HTML and download the available image resources
|
30
|
+
+ Duplicate handling--if it pulls 2+ images, it will: 1) prefix all images with a DUP_ label, 2) highlight the Excel row in red, 3) create and automatically open a text file log (saved to the /images folder) that provides the row number of the Excel spreadsheet that got multiple images, and a list of the image file names to be compared
|
31
|
+
+ Inserts a thumbnail of the image into the spreadsheet (creates column)
|
32
|
+
+ Generates caption text boxes in InDesign with Figure Title paragraph style applied, saved to a /resources folder
|
33
|
+
|
34
|
+
|
35
|
+
|
36
|
+
|
37
|
+
|
38
|
+
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# Opens Excel Log
|
2
|
+
|
3
|
+
require 'win32ole'
|
4
|
+
|
5
|
+
# Reads an Excel "image log" workbook through Excel's COM automation
# interface (WIN32OLE) and turns it into a hash that maps figure names
# (e.g. "fig_1-1") to the source URL found on the same row.
class ExcelProcessor

  # First worksheet row that is scanned for image entries.
  FIRST_DATA_ROW = 10
  # Worksheet column that holds the figure number.
  FIGURE_COLUMN = 3
  # Worksheet column that holds the source URL.
  LINK_COLUMN = 4

  attr_accessor :spreadsheet, :image_log

  # spreadsheet - path of the workbook; passed unchanged to Workbooks.Open.
  # image_log   - initial figure-name => link hash (empty by default).
  def initialize(spreadsheet, image_log={})
    @spreadsheet = spreadsheet
    @image_log = image_log
  end

  # Opens the workbook in a hidden Excel instance, reads the figure and link
  # columns of the first worksheet from FIRST_DATA_ROW down to the last used
  # row, and stores the result in @image_log.
  #
  # Rows whose figure cell is blank are skipped. When two rows produce the
  # same figure name the later row wins. The workbook is closed without
  # saving and Excel is quit even when reading fails part-way through.
  #
  # Returns the populated @image_log hash.
  def read_spreadsheet
    excel = WIN32OLE.new('Excel.Application')
    workbook = nil

    begin
      excel.visible = false
      workbook = excel.Workbooks.Open(@spreadsheet)
      sheet = workbook.Worksheets(1)

      # UsedRange need not start at row 1, so its row count alone is not the
      # index of the last used row; offset it by the range's first row.
      used = sheet.UsedRange
      last_row = used.Row + used.Rows.Count - 1

      entries = {}
      (FIRST_DATA_ROW..last_row).each do |row|
        figure = figure_name(sheet.Cells(row, FIGURE_COLUMN).Value)
        next if figure.empty?

        entries[figure] = link_text(sheet.Cells(row, LINK_COLUMN).Value)
      end

      @image_log = entries
    ensure
      # Shut it down: never leave a hidden Excel process behind.
      begin
        workbook.Close(0) if workbook # 0 => do not save changes
      ensure
        excel.Quit
      end
    end
  end

  private

  # Converts a raw figure cell into a file-friendly name: everything before
  # the first digit becomes "fig_" and the first period becomes a dash
  # ("Figure 1.1" => "fig_1-1"). Blank cells yield "".
  def figure_name(raw)
    text = raw.to_s.strip
    return "" if text.empty?

    text.sub(/(\D*)/, "fig_").sub(/(\.)/, "-")
  end

  # Returns the cell value when it is text containing a period (a minimal
  # "looks like a URL" check); anything else, including nil and numeric
  # cells, yields "".
  def link_text(raw)
    raw.is_a?(String) && raw.include?(".") ? raw : ""
  end

end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require "#{File.dirname(__FILE__)}/./excel_processor"
|
2
|
+
require 'mechanize'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'logger'
|
5
|
+
|
6
|
+
# Visits the page behind every link of an image log, follows that page's
# "Original file" link and saves the result, named after its figure, in an
# "images" folder that sits beside the spreadsheet.
class ImageScraper

  attr_accessor :log, :location

  # log      - collection yielding (figure name, page URL) pairs from #each,
  #            e.g. the hash built by ExcelProcessor.
  # location - path of the spreadsheet; only its directory is used.
  def initialize(log, location)
    @log = log
    @location = location
  end

  # Creates the "images" folder next to the spreadsheet, or prints "Exists!"
  # when the folder is already there.
  def create_dir
    # Dir.exist? replaces Dir.exists?, which newer Rubies no longer provide.
    if Dir.exist?(images_dir)
      puts "Exists!"
    else
      Dir.mkdir(images_dir)
    end
  end

  # Downloads one image per log entry into the "images" folder (which must
  # already exist, see #create_dir). Mechanize activity is logged to
  # images/scrape_summary.log. A failure on one figure is reported on
  # stdout and does not stop the remaining figures.
  #
  # Returns @log.
  def scrape
    site_seeker = Mechanize.new { |a| a.log = Logger.new("#{images_dir}/scrape_summary.log") }
    #For Wiki only
    @log.each do |fig, link|
      begin
        # Keep the page link's own 3-4 character extension (nil when absent).
        ext = link.to_s[/\..{3,4}$/]
        page = site_seeker.get(link)
        puts "Found #{fig} at #{link}"
        img_page = page.links_with(:text => "Original file").first.click
        puts "Located original image"
        img_page.save!("#{images_dir}/#{fig}#{ext}")
        puts "#{fig} has been saved."
      rescue StandardError => e
        # Report why the figure failed instead of discarding the error.
        puts "Uh-oh, something went wrong with #{fig.upcase}: #{e.message}"
      end
    end
  end

  private

  # Path of the "images" folder that sits beside the spreadsheet.
  def images_dir
    "#{File.dirname(@location)}/images"
  end

end
|
43
|
+
|
data/lib/log2layout.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require_relative './image_scraper.rb'
|
4
|
+
require_relative './excel_processor.rb'
|
5
|
+
|
6
|
+
#TO RUN MANUALLY ON THE COMMAND LINE
|
7
|
+
|
8
|
+
# excel_path = ARGV[0]
|
9
|
+
|
10
|
+
# @book = ExcelProcessor.new(excel_path)
|
11
|
+
# @book.read_spreadsheet
|
12
|
+
# @book.image_log
|
13
|
+
|
14
|
+
# @image_scraper = ImageScraper.new(@book.image_log, @book.spreadsheet)
|
15
|
+
# @image_scraper.create_dir
|
16
|
+
# @image_scraper.scrape
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'spec_helper.rb'
|
2
|
+
require "#{File.dirname(__FILE__)}/../lib/excel_processor"
|
3
|
+
|
4
|
+
# Integration spec for ExcelProcessor. It drives a real Excel instance
# against resources/image-log-template.xlsx.
describe ExcelProcessor do

  # Every example starts from a fresh processor that has not been read yet.
  before do
    @book = ExcelProcessor.new("#{File.dirname(__FILE__)}/../resources/image-log-template.xlsx")
  end

  context "#new" do
    it "should have the spreadsheet point to a file that exists" do
      # eq(true)/eq(false) are used instead of be_true/be_false, which
      # RSpec 3 no longer provides.
      File.exist?(@book.spreadsheet).should eq(true)
    end

    it "should create an empty hash for image log" do
      @book.image_log.should be_a_kind_of(Hash)
      @book.image_log.length.should eq 0
    end
  end

  context "#read_spreadsheet" do
    # All examples below inspect the populated log, so read it in one hook
    # rather than repeating the call in each example.
    before do
      @book.read_spreadsheet
    end

    it "should open Excel and populate image log" do
      @book.image_log.should_not be_empty
    end

    it "should start with 'fig_'" do
      @book.image_log.has_key?("fig_8-8").should eq(true)
      @book.image_log.has_key?("fig_2").should eq(true)
    end

    it "should replace periods between numbers with dashes" do
      @book.image_log.has_key?("fig_1-1").should eq(true)
    end

    it "should match the right figure numbers with the right links" do
      @book.image_log.fetch("fig_1-1").should eq "http://commons.wikimedia.org/wiki/File:Basketry-covered_lightbulb_01.jpg"
    end

    it "should remove any pairs with blank values" do
      @book.image_log.has_key?("").should eq(false)
    end

    it "should return an @image_log length of 4" do
      @book.image_log.length.should eq 4
    end

  end

end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'spec_helper.rb'
|
2
|
+
require "#{File.dirname(__FILE__)}/../lib/image_scraper"
|
3
|
+
require "#{File.dirname(__FILE__)}/../lib/excel_processor"
|
4
|
+
require "mechanize"
|
5
|
+
|
6
|
+
# Integration spec for ImageScraper. It reads the template workbook through
# Excel and downloads from the links it contains, so it needs Excel and
# network access.
describe ImageScraper do

  # The workbook is read once and the scraper shared by the whole group.
  before :all do
    @book = ExcelProcessor.new("#{File.dirname(__FILE__)}/../resources/image-log-template.xlsx")
    @book.read_spreadsheet
    @image_scraper = ImageScraper.new(@book.image_log, @book.spreadsheet)
    @images_dir = "#{File.dirname(@image_scraper.location)}/images"
  end

  context "#new" do
    it "should receive @image_log from the Excel Processor object" do
      @image_scraper.log.should be_a_kind_of(Hash)
      @image_scraper.log.should_not be_empty
    end

    it "should receive the location of the spreadsheet" do
      @image_scraper.location.should == "#{File.dirname(__FILE__)}/../resources/image-log-template.xlsx"
    end
  end

  context "#create_dir" do
    it "should create a folder called 'images'" do
      @image_scraper.create_dir
      # Check the images folder itself, not the spreadsheet's directory
      # (which exists whether or not create_dir did anything).
      File.directory?(@images_dir).should eq(true)
    end
  end

  context "#scrape" do
    # Scrape once for the whole context and remember any escaping error, so
    # the examples below do not depend on the order in which they run.
    before :all do
      @image_scraper.create_dir
      @scrape_error = begin
        @image_scraper.scrape
        nil
      rescue StandardError => e
        e
      end
    end

    it "should handle all errors" do
      @scrape_error.should be_nil
    end

    it "should save the image file" do
      File.exist?("#{@images_dir}/fig_1-1.jpg").should eq(true)
    end

    it "should create a scraping summary log file" do
      File.exist?("#{@images_dir}/scrape_summary.log").should eq(true)
    end
  end

end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: log2layout
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Sarah W.
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2014-02-15 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: Pulls Wiki URLs from Excel spreadsheet and automates image downloading.
|
15
|
+
email: sarahcwheeler@smail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- README.md
|
21
|
+
- Gemfile
|
22
|
+
- spec/excel_processor_spec.rb
|
23
|
+
- spec/image_scraper_spec.rb
|
24
|
+
- spec/spec_helper.rb
|
25
|
+
- lib/excel_processor.rb
|
26
|
+
- lib/image_scraper.rb
|
27
|
+
- lib/log2layout.rb
|
28
|
+
homepage: http://rubygems.org/gems/log2layout
|
29
|
+
licenses:
|
30
|
+
- MIT
|
31
|
+
post_install_message:
|
32
|
+
rdoc_options: []
|
33
|
+
require_paths:
|
34
|
+
- lib
|
35
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
36
|
+
none: false
|
37
|
+
requirements:
|
38
|
+
- - ! '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
43
|
+
requirements:
|
44
|
+
- - ! '>='
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0'
|
47
|
+
requirements: []
|
48
|
+
rubyforge_project:
|
49
|
+
rubygems_version: 1.8.28
|
50
|
+
signing_key:
|
51
|
+
specification_version: 3
|
52
|
+
summary: Scraper for Wikimedia images.
|
53
|
+
test_files: []
|