log2layout 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +6 -0
- data/README.md +38 -0
- data/lib/excel_processor.rb +60 -0
- data/lib/image_scraper.rb +43 -0
- data/lib/log2layout.rb +16 -0
- data/spec/excel_processor_spec.rb +56 -0
- data/spec/image_scraper_spec.rb +49 -0
- data/spec/spec_helper.rb +3 -0
- metadata +53 -0
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# Log2Layout
|
2
|
+
|
3
|
+
## Features:
|
4
|
+
|
5
|
+
+ Command-line image scraper based on an Excel spreadsheet template (aka image log)
|
6
|
+
+ Pulls URLs and figure numbers from an Excel spreadsheet, creates an "images" folder in the same location as the image log, and downloads each image file
|
7
|
+
+ Automatically renames image files to figure numbers
|
8
|
+
+ Keeps original image extension (i.e., no mass .jpg renaming)
|
9
|
+
+ Currently only works with Wikimedia images
|
10
|
+
+ Known bugs: issues with HTTPS access; still needs handling for Wikimedia pages that have no "Original file" link
|
11
|
+
|
12
|
+
|
13
|
+
## Log2Layout Recipe
|
14
|
+
|
15
|
+
1. Install the Log2Layout gem.
|
16
|
+
|
17
|
+
gem install log2layout
|
18
|
+
|
19
|
+
2. Run the gem with the name of your Excel spreadsheet file as the argument.
|
20
|
+
|
21
|
+
log2layout c:/Users/You/Desktop/image_log.xlsx
|
22
|
+
|
23
|
+
3. Magic happens (sometimes the magic can take a little while--these are high res images, after all), and suddenly an "images" folder appears beside your spreadsheet. Ta-da!
|
24
|
+
|
25
|
+
|
26
|
+
## Planned Features for When I'm a Better Programmer:
|
27
|
+
|
28
|
+
+ GUI (wxRuby)
|
29
|
+
+ General page scanning: will scan page for a unique <img> tag in the page's HTML and download the available image resources
|
30
|
+
+ Duplicate handling--if it pulls 2+ images, it will: 1) prefix all images with a DUP_ label, 2) highlight the Excel row in red, 3) create and automatically open a text file log (saved to the /images folder) that provides the row number of the Excel spreadsheet that got multiple images, and a list of the image file names to be compared
|
31
|
+
+ Inserts a thumbnail of the image into the spreadsheet (creates column)
|
32
|
+
+ Generates caption text boxes in InDesign with Figure Title paragraph style applied, saved to a /resources folder
|
33
|
+
|
34
|
+
|
35
|
+
|
36
|
+
|
37
|
+
|
38
|
+
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# Opens Excel Log
|
2
|
+
|
3
|
+
require 'win32ole'
|
4
|
+
|
5
|
+
# Reads an Excel "image log" through Excel's COM automation interface
# (WIN32OLE) and turns it into a hash of figure name => source URL.
#
# Only the first worksheet is read. Data is taken from row FIRST_DATA_ROW
# down to the row count of the sheet's used range; the figure number is read
# from column FIGURE_COLUMN and the source URL from column LINK_COLUMN.
class ExcelProcessor

  # 1-based worksheet row on which the figure/link data starts.
  FIRST_DATA_ROW = 10
  # 1-based column holding the figure number.
  FIGURE_COLUMN = 3
  # 1-based column holding the source URL.
  LINK_COLUMN = 4

  attr_accessor :spreadsheet, :image_log

  # spreadsheet - path of the workbook, handed to Excel's Workbooks.Open as is.
  # image_log   - initial figure => link hash (replaced by #read_spreadsheet).
  def initialize(spreadsheet, image_log={})
    @spreadsheet = spreadsheet
    @image_log = image_log
  end

  # Opens the workbook in an invisible Excel instance, rebuilds @image_log
  # from its first worksheet and returns the new hash.
  #
  # Rows whose figure cell is blank are skipped. When the same figure name
  # occurs more than once, the last row wins. The workbook is closed without
  # saving and Excel is quit even when reading raises, so a failed run does
  # not leave an orphaned Excel process behind; the error itself propagates.
  def read_spreadsheet
    excel = WIN32OLE.new('Excel.Application')
    workbook = nil

    begin
      excel.visible = false
      workbook = excel.Workbooks.Open(@spreadsheet)
      sheet = workbook.Worksheets(1)

      entries = {}
      (FIRST_DATA_ROW..sheet.UsedRange.Rows.Count).each do |row|
        figure = figure_name(sheet.Cells(row, FIGURE_COLUMN).Value)
        next if figure.empty?

        entries[figure] = link_for(sheet.Cells(row, LINK_COLUMN).Value)
      end

      @image_log = entries
    ensure
      # Shut it down: close without saving (0), then always quit Excel.
      begin
        workbook.Close(0) if workbook
      ensure
        excel.Quit
      end
    end
  end

  private

  # Converts a raw figure cell into a file-name friendly key: any leading
  # non-digit text is replaced by "fig_" and the first period by a dash
  # ("Figure 1.1" => "fig_1-1"). Blank cells yield "" so the caller can
  # drop the row.
  def figure_name(value)
    # NOTE(review): WIN32OLE is expected to return numeric cells as Float
    # (a cell holding 2 arrives as 2.0); whole numbers are collapsed so the
    # key becomes "fig_2" rather than "fig_2-0" -- confirm against real Excel.
    if value.is_a?(Float) && value.finite? && value == value.floor
      value = value.to_i
    end

    text = value.to_s
    # The blank check must happen before substitution: an empty string would
    # otherwise still be turned into the bogus key "fig_".
    return "" if text.strip.empty?

    text.sub(/(\D*)/, "fig_").sub(/(\.)/, "-")
  end

  # Returns the cell value when it is text containing a period (the original
  # heuristic for "looks like a URL"), otherwise "". Non-text cells such as
  # numbers or empty cells never count as links.
  def link_for(value)
    value.is_a?(String) && value.include?(".") ? value : ""
  end

end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require "#{File.dirname(__FILE__)}/./excel_processor"
|
2
|
+
require 'mechanize'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'logger'
|
5
|
+
|
6
|
+
# Downloads the original image behind each Wikimedia file page listed in an
# image log and stores it, named after its figure, in an "images" folder
# located beside the spreadsheet.
class ImageScraper

  attr_accessor :log, :location

  # log      - hash of figure name => page URL (as built by ExcelProcessor).
  # location - path of the spreadsheet; only its directory is used.
  def initialize(log, location)
    @log = log
    @location = location
  end

  # Creates the "images" folder beside the spreadsheet, or prints "Exists!"
  # when it is already there.
  def create_dir
    if Dir.exist?(images_dir)
      puts "Exists!"
    else
      Dir.mkdir(images_dir)
    end
  end

  # Visits every link in the log, follows its "Original file" link and saves
  # the target as <figure><extension> in the images folder. The extension is
  # the trailing 3-4 character ".xxx" suffix of the page URL, if any.
  #
  # A failure on one figure is reported on stdout, recorded with its reason
  # in images/scrape_summary.log, and does not stop the remaining figures.
  # Returns the log hash.
  def scrape
    # The summary log lives inside the images folder, so make sure the folder
    # exists even when #create_dir was not called first.
    Dir.mkdir(images_dir) unless Dir.exist?(images_dir)

    summary = Logger.new("#{images_dir}/scrape_summary.log")
    site_seeker = Mechanize.new { |a| a.log = summary }

    #For Wiki only
    @log.each do |fig, link|
      begin
        ext = link.to_s[/\..{3,4}$/]
        page = site_seeker.get(link)
        puts "Found #{fig} at #{link}"

        original = page.links_with(:text => "Original file").first
        # Fail with a readable reason instead of a NoMethodError on nil.
        raise "no \"Original file\" link found at #{link}" unless original

        img_page = original.click
        puts "Located original image"
        img_page.save!("#{images_dir}/#{fig}#{ext}")
        puts "#{fig} has been saved."
      rescue StandardError => e
        puts "Uh-oh, something went wrong with #{fig.upcase}"
        summary.error("#{fig}: #{e.class}: #{e.message}")
      end
    end
  end

  private

  # Path of the "images" folder that sits beside the spreadsheet.
  def images_dir
    "#{File.dirname(@location)}/images"
  end

end
|
43
|
+
|
data/lib/log2layout.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require_relative './image_scraper.rb'
|
4
|
+
require_relative './excel_processor.rb'
|
5
|
+
|
6
|
+
#TO RUN MANUALLY ON THE COMMAND LINE
|
7
|
+
|
8
|
+
# excel_path = ARGV[0]
|
9
|
+
|
10
|
+
# @book = ExcelProcessor.new(excel_path)
|
11
|
+
# @book.read_spreadsheet
|
12
|
+
# @book.image_log
|
13
|
+
|
14
|
+
# @image_scraper = ImageScraper.new(@book.image_log, @book.spreadsheet)
|
15
|
+
# @image_scraper.create_dir
|
16
|
+
# @image_scraper.scrape
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'spec_helper.rb'
|
2
|
+
require "#{File.dirname(__FILE__)}/../lib/excel_processor"
|
3
|
+
|
4
|
+
# Specs for ExcelProcessor. They read the image-log-template.xlsx fixture
# through #read_spreadsheet, which drives Excel via WIN32OLE.
describe ExcelProcessor do

  # Fixture workbook used by every example.
  let(:template_path) { "#{File.dirname(__FILE__)}/../resources/image-log-template.xlsx" }

  # A fresh processor per example, so no state leaks between examples.
  before { @book = ExcelProcessor.new(template_path) }

  context "#new" do
    it "should have the spreadsheet point to a file that exists" do
      File.exist?(@book.spreadsheet).should be_true
    end

    it "should create an empty hash for image log" do
      log = @book.image_log
      log.should be_a_kind_of(Hash)
      log.length.should eq 0
    end
  end

  context "#read_spreadsheet" do
    # Every example in this group inspects the log produced by a fresh read.
    before { @book.read_spreadsheet }

    let(:log) { @book.image_log }

    it "should open Excel and populate image log" do
      log.should_not be_empty
    end

    it "should start with 'fig_'" do
      log.has_key?("fig_8-8").should be_true
      log.has_key?("fig_2").should be_true
    end

    it "should replace periods between numbers with dashes" do
      log.has_key?("fig_1-1").should be_true
    end

    it "should match the right figure numbers with the right links" do
      log.fetch("fig_1-1").should eq "http://commons.wikimedia.org/wiki/File:Basketry-covered_lightbulb_01.jpg"
    end

    it "should remove any pairs with blank values" do
      log.has_key?("").should be_false
    end

    it "should return an @image_log length of 4" do
      log.length.should eq 4
    end
  end

end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'spec_helper.rb'
|
2
|
+
require "#{File.dirname(__FILE__)}/../lib/image_scraper"
|
3
|
+
require "#{File.dirname(__FILE__)}/../lib/excel_processor"
|
4
|
+
require "mechanize"
|
5
|
+
|
6
|
+
# Specs for ImageScraper. The scraper is built once from the
# image-log-template.xlsx fixture; building it reads the workbook through
# ExcelProcessor, and the #scrape examples perform real downloads.
describe ImageScraper do

  before :all do
    @book = ExcelProcessor.new("#{File.dirname(__FILE__)}/../resources/image-log-template.xlsx")
    @book.read_spreadsheet
    @image_scraper = ImageScraper.new(@book.image_log, @book.spreadsheet)
  end

  context "#new" do
    it "should do receive @image_log from the Excel Processor object" do
      @image_scraper.log.should be_a_kind_of(Hash)
      @image_scraper.log.should_not be_empty
    end

    it "should receive the location of the spreadsheet" do
      @image_scraper.location.should == "#{File.dirname(__FILE__)}/../resources/image-log-template.xlsx"
    end
  end

  context "#create_dir" do
    it "should create a folder called 'images'" do
      @image_scraper.create_dir
      # Assert on the images folder itself; the spreadsheet's own directory
      # exists regardless of whether create_dir did anything.
      img_dir = "#{File.dirname(@image_scraper.location)}/images"
      File.directory?(img_dir).should be_true
    end
  end

  context "#scrape" do
    # Run the scrape once for the whole group and remember any error it
    # raised. Instance variables assigned inside an example are not visible
    # to later examples, so the shared run has to happen in a before(:all)
    # hook for the file checks below to be independent of example order.
    before :all do
      @image_scraper.create_dir
      @scrape_error = begin
        @image_scraper.scrape
        nil
      rescue StandardError => e
        e
      end
    end

    it "should handle all errors" do
      @scrape_error.should be_nil
    end

    it "should save the image file" do
      File.exist?("#{File.dirname(@image_scraper.location)}/images/fig_1-1.jpg").should be_true
    end

    it "should create a scraping summary log file" do
      File.exist?("#{File.dirname(@image_scraper.location)}/images/scrape_summary.log").should be_true
    end
  end

end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: log2layout
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Sarah W.
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2014-02-15 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: Pulls Wiki URLs from Excel spreadsheet and automates image downloading.
|
15
|
+
email: sarahcwheeler@smail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- README.md
|
21
|
+
- Gemfile
|
22
|
+
- spec/excel_processor_spec.rb
|
23
|
+
- spec/image_scraper_spec.rb
|
24
|
+
- spec/spec_helper.rb
|
25
|
+
- lib/excel_processor.rb
|
26
|
+
- lib/image_scraper.rb
|
27
|
+
- lib/log2layout.rb
|
28
|
+
homepage: http://rubygems.org/gems/log2layout
|
29
|
+
licenses:
|
30
|
+
- MIT
|
31
|
+
post_install_message:
|
32
|
+
rdoc_options: []
|
33
|
+
require_paths:
|
34
|
+
- lib
|
35
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
36
|
+
none: false
|
37
|
+
requirements:
|
38
|
+
- - ! '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
43
|
+
requirements:
|
44
|
+
- - ! '>='
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0'
|
47
|
+
requirements: []
|
48
|
+
rubyforge_project:
|
49
|
+
rubygems_version: 1.8.28
|
50
|
+
signing_key:
|
51
|
+
specification_version: 3
|
52
|
+
summary: Scraper for Wikimedia images.
|
53
|
+
test_files: []
|