RubyGems - log2layout - Versions diffs - 0.0.1 - Mend

log2layout 0.0.1

Files changed (9) hide show

data/Gemfile +6 -0
data/README.md +38 -0
data/lib/excel_processor.rb +60 -0
data/lib/image_scraper.rb +43 -0
data/lib/log2layout.rb +16 -0
data/spec/excel_processor_spec.rb +56 -0
data/spec/image_scraper_spec.rb +49 -0
data/spec/spec_helper.rb +3 -0
metadata +53 -0

data/Gemfile ADDED

@@ -0,0 +1,6 @@
+# A sample Gemfile
+source "https://rubygems.org"
+# gem "rails"
+gem 'mechanize'
+gem 'rspec'

data/README.md ADDED

@@ -0,0 +1,38 @@
+# Log2Layout
+## Features:
++ Command-line image scraper based on an Excel spreadsheet template (aka image log)
++ Pulls URLs and figure numbers from an Excel spreadsheet, creates an "images" folder next to the image log, and downloads each image file into it
++ Automatically renames image files to figure numbers
++ Keeps original image extension (i.e., no mass .jpg renaming)
++ Currently only works with Wikimedia images
++ Known bugs: HTTPS access is unreliable, and Wiki image pages without an "Original file" link are not handled yet
+## Log2Layout Recipe
+1. Install the Log2Layout gem.
+	gem install log2layout
+2. Run the gem with the name of your Excel spreadsheet file as the argument.
+	log2layout c:/Users/You/Desktop/image_log.xlsx
+3. Magic happens (sometimes the magic can take a little while--these are high res images, after all), and suddenly an "images" folder appears beside your spreadsheet. Ta-da!
+## Planned Features for When I'm a Better Programmer:
++ GUI interface (wxRuby)
++ General page scanning: will scan page for a unique <img> tag in the page's HTML and download the available image resources
++ Duplicate handling--if it pulls 2+ images for one row, it will: 1) prefix all of those images with a DUP_ label, 2) highlight the Excel row in red, and 3) create and automatically open a text log file (saved to the /images folder) that gives the Excel row number that produced multiple images and lists the image file names to be compared
++ Inserts a thumbnail of the image into the spreadsheet (creates column)
++ Generates caption text boxes in InDesign with Figure Title paragraph style applied, saved to a /resources folder

data/lib/excel_processor.rb ADDED

@@ -0,0 +1,60 @@
+ # Opens Excel Log
+require 'win32ole'
+class ExcelProcessor
+	attr_accessor :spreadsheet, :image_log
+	def initialize(spreadsheet, image_log={})
+		@spreadsheet = spreadsheet
+		@image_log = image_log
+	end
+	def read_spreadsheet
+		excel = WIN32OLE.new('Excel.Application')
+		excel.visible = false
+		img_log = excel.Workbooks.Open(@spreadsheet)
+		img_ws = img_log.Worksheets(1)
+		 	# Set-up arrays for column values
+			link_array = []
+			figure_array = []
+			# Pull values from Figure # column
+			for row in 10..img_ws.UsedRange.Rows.Count do
+			 	cell = img_ws.Cells(row,3).value.to_s
+			 	if cell != nil
+			 		c = cell.sub(/(\D*)/, "fig_")
+					new_cell = c.sub(/(\.)/, "-")
+			 	else
+			 		new_cell = ""
+			 	end
+			 	figure_array << new_cell
+			end
+			# Pull values from Source URL column
+			for row in 10..img_ws.UsedRange.Rows.Count do
+			 	cell = img_ws.Cells(row,4).Value
+				if (cell != nil) && (cell.include?("."))
+			 		new_cell = cell
+			 	else
+			 		new_cell = ""
+			 	end
+			 	link_array << new_cell
+			end
+			# Zip arrays into hash of figure numbers and links
+			@image_log = Hash[figure_array.zip(link_array)]
+			# Remove blanks
+			@image_log.delete_if { |k, v| k == "" }
+		# Shut it down
+		excel.ActiveWorkbook.Close(0)
+		excel.Quit
+	end
+end

data/lib/image_scraper.rb ADDED

@@ -0,0 +1,43 @@
+require "#{File.dirname(__FILE__)}/./excel_processor"
+require 'mechanize'
+require 'open-uri'
+require 'logger'
+class ImageScraper
+	attr_accessor :log, :location
+	def initialize log, location
+		@log = log
+		@location = location
+	end
+	def create_dir
+		if Dir.exists?("#{File.dirname(@location)}/images")
+			puts "Exists!"
+		else
+			Dir.mkdir("#{File.dirname(@location)}/images")
+		end
+	end
+	def scrape
+		site_seeker = Mechanize.new { |a| a.log = Logger.new("#{File.dirname(@location)}/images/scrape_summary.log") }
+		#For Wiki only
+		@log.each do |fig, link|
+				ext = link.scan(/\..{3,4}$/)
+				begin
+					page = site_seeker.get(link)
+					puts "Found #{fig} at #{link}"
+					img_page = page.links_with(:text => "Original file").first.click
+					puts "Located original image"
+					img = img_page.save!("#{File.dirname(@location)}/images/#{fig}#{ext[0]}")
+					puts "#{fig} has been saved."
+				rescue
+					puts "Uh-oh, something went wrong with #{fig.upcase}"
+					next
+				end
+		end
+	end
+end

data/lib/log2layout.rb ADDED

@@ -0,0 +1,16 @@
+#!/usr/bin/env ruby
+require_relative './image_scraper.rb'
+require_relative './excel_processor.rb'
+#TO RUN MANUALLY ON THE COMMAND LINE
+# excel_path = ARGV[0]
+# @book = ExcelProcessor.new(excel_path)
+# @book.read_spreadsheet
+# @book.image_log
+# @image_scraper = ImageScraper.new(@book.image_log, @book.spreadsheet)
+# @image_scraper.create_dir
+# @image_scraper.scrape

data/spec/excel_processor_spec.rb ADDED

@@ -0,0 +1,56 @@
+require 'spec_helper.rb'
+require "#{File.dirname(__FILE__)}/../lib/excel_processor"
+describe ExcelProcessor do
+	before do
+		@book = ExcelProcessor.new("#{File.dirname(__FILE__)}/../resources/image-log-template.xlsx")
+	end
+	context "#new" do
+		it "should have the spreadsheet point to a file that exists" do
+			File.exist?(@book.spreadsheet).should be_true
+		end
+		it "should create an empty hash for image log" do
+			@book.image_log.should be_a_kind_of(Hash)
+			@book.image_log.length.should eq 0
+		end
+	end
+	context "#read_spreadsheet" do
+		it "should open Excel and populate image log" do
+			@book.read_spreadsheet
+			@book.image_log.should_not be_empty
+		end
+		it "should start with 'fig_'" do
+			@book.read_spreadsheet
+			@book.image_log.has_key?("fig_8-8").should be_true
+			@book.image_log.has_key?("fig_2").should be_true
+		end
+		it "should replace periods between numbers with dashes" do
+			@book.read_spreadsheet
+			@book.image_log.has_key?("fig_1-1").should be_true
+		end
+		it "should match the right figure numbers with the right links" do
+			@book.read_spreadsheet
+			@book.image_log.fetch("fig_1-1").should eq "http://commons.wikimedia.org/wiki/File:Basketry-covered_lightbulb_01.jpg"
+		end
+		it "should remove any pairs with blank values" do
+			@book.read_spreadsheet
+			@book.image_log.has_key?("").should be_false
+		end
+		it "should return an @image_log length of 4" do
+			@book.read_spreadsheet
+			@book.image_log.length.should eq 4
+		end
+	end
+end

data/spec/image_scraper_spec.rb ADDED

@@ -0,0 +1,49 @@
+require 'spec_helper.rb'
+require "#{File.dirname(__FILE__)}/../lib/image_scraper"
+require "#{File.dirname(__FILE__)}/../lib/excel_processor"
+require "mechanize"
+describe ImageScraper do
+	before :all do
+		@book = ExcelProcessor.new("#{File.dirname(__FILE__)}/../resources/image-log-template.xlsx")
+		@book.read_spreadsheet
+		@image_scraper = ImageScraper.new(@book.image_log, @book.spreadsheet)
+	end
+	context "#new" do
+		it "should do receive @image_log from the Excel Processor object" do
+			@image_scraper.log.should be_a_kind_of(Hash)
+			@image_scraper.log.should_not be_empty
+		end
+		it "should receive the location of the spreadsheet" do
+			@image_scraper.location.should == "#{File.dirname(__FILE__)}/../resources/image-log-template.xlsx"
+		end
+	end
+	context "#create_dir" do
+		it "should create a folder called 'images'" do
+			@image_scraper.create_dir
+			img_dir = File.dirname(@image_scraper.location)
+			File.exist?(img_dir).should be_true
+		end
+	end
+	context "#scrape" do
+		it "should handle all errors" do
+			@final_log = @image_scraper.scrape
+			expect { @final_log }.not_to raise_error
+		end
+		it "should save the image file" do
+			@final_log
+			File.exist?("#{File.dirname(@image_scraper.location)}/images/fig_1-1.jpg").should be_true
+		end
+		it "should create a scraping summary log file" do
+			File.exist?("#{File.dirname(@image_scraper.location)}/images/scrape_summary.log").should be_true
+		end
+	end
+end

data/spec/spec_helper.rb ADDED

@@ -0,0 +1,3 @@
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+require 'log2layout.rb'

metadata ADDED

@@ -0,0 +1,53 @@
+--- !ruby/object:Gem::Specification
+name: log2layout
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+  prerelease:
+platform: ruby
+authors:
+- Sarah W.
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2014-02-15 00:00:00.000000000 Z
+dependencies: []
+description: Pulls Wiki URLs from Excel spreadsheet and automates image downloading.
+email: sarahcwheeler@smail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- README.md
+- Gemfile
+- spec/excel_processor_spec.rb
+- spec/image_scraper_spec.rb
+- spec/spec_helper.rb
+- lib/excel_processor.rb
+- lib/image_scraper.rb
+- lib/log2layout.rb
+homepage: http://rubygems.org/gems/log2layout
+licenses:
+- MIT
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.28
+signing_key:
+specification_version: 3
+summary: Scraper for Wikimedia images.
+test_files: []