log2layout 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ # A sample Gemfile
2
+ source "https://rubygems.org"
3
+
4
+ # gem "rails"
5
+ gem 'mechanize'
6
+ gem 'rspec'
@@ -0,0 +1,38 @@
1
+ # Log2Layout
2
+
3
+ ## Features:
4
+
5
+ + Command-line image scraper based on an Excel spreadsheet template (aka image log)
6
+ + Pulls URLs and figure numbers from an Excel spreadsheet, creates an "/images" folder next to the image log, and downloads each image file into it
7
+ + Automatically renames image files to figure numbers
8
+ + Keeps original image extension (i.e., no mass .jpg renaming)
9
+ + Currently only works with Wikimedia images
10
+ + Known bugs: HTTPS URLs may fail to download; Wikimedia pages without an "Original file" link are not handled yet
11
+
12
+
13
+ ## Log2Layout Recipe
14
+
15
+ 1. Install the Log2Layout gem.
16
+
17
+ gem install log2layout
18
+
19
+ 2. Run the gem with the name of your Excel spreadsheet file as the argument.
20
+
21
+ log2layout c:/Users/You/Desktop/image_log.xlsx
22
+
23
+ 3. Magic happens (sometimes the magic can take a little while--these are high res images, after all), and suddenly an "images" folder appears beside your spreadsheet. Ta-da!
24
+
25
+
26
+ ## Planned Features for When I'm a Better Programmer:
27
+
28
+ + GUI (wxRuby)
29
+ + General page scanning: will scan page for a unique <img> tag in the page's HTML and download the available image resources
30
+ + Duplicate handling--if it pulls 2+ images, it will: 1) prefix all images with a DUP_ label, 2) highlight the Excel row in red, 3) create and automatically open a text file log (saved to the /images folder) that provides the row number of the Excel spreadsheet that got multiple images, and a list of the image file names to be compared
31
+ + Inserts a thumbnail of the image into the spreadsheet (creates column)
32
+ + Generates caption text boxes in InDesign with Figure Title paragraph style applied, saved to a /resources folder
33
+
34
+
35
+
36
+
37
+
38
+
@@ -0,0 +1,60 @@
1
+ # Opens Excel Log
2
+
3
+ require 'win32ole'
4
+
5
# Reads an Excel "image log" through Excel's COM automation interface
# (WIN32OLE) and turns it into a Hash of figure names => source URLs.
class ExcelProcessor
  # Worksheet layout used by the image log: data starts on row 10,
  # column 3 holds the figure number and column 4 holds the source URL.
  FIRST_DATA_ROW = 10
  FIGURE_COLUMN = 3
  LINK_COLUMN = 4

  attr_accessor :spreadsheet, :image_log

  # @param spreadsheet [String] path of the workbook to open
  # @param image_log [Hash] initial figure => URL mapping; it is replaced
  #   by the next call to #read_spreadsheet
  def initialize(spreadsheet, image_log = {})
    @spreadsheet = spreadsheet
    @image_log = image_log
  end

  # Opens the workbook in a hidden Excel instance, reads the first
  # worksheet and stores the figure => URL mapping in @image_log.
  #
  # Rows whose figure cell is blank are skipped. When the same figure name
  # appears more than once, the later row wins. Excel is closed even when
  # reading fails, so no Excel process is left running in the background.
  #
  # @return [Hash{String => String}] the new image log
  def read_spreadsheet
    excel = WIN32OLE.new('Excel.Application')
    workbook = nil

    begin
      excel.visible = false
      workbook = excel.Workbooks.Open(@spreadsheet)
      sheet = workbook.Worksheets(1)

      entries = {}
      # NOTE(review): UsedRange.Rows.Count is used as the last row index,
      # which presumably relies on the used range starting at row 1 - confirm.
      (FIRST_DATA_ROW..sheet.UsedRange.Rows.Count).each do |row|
        figure = figure_key(sheet.Cells(row, FIGURE_COLUMN).Value)
        next if figure.empty?

        entries[figure] = source_link(sheet.Cells(row, LINK_COLUMN).Value)
      end

      @image_log = entries
    ensure
      # Close without saving (0) and always quit Excel.
      workbook.Close(0) if workbook
      excel.Quit
    end
  end

  private

  # Turns a figure cell into a file-friendly name, e.g. "Fig. 1.1" => "fig_1-1".
  # Leading non-digits are replaced by "fig_" and the first period by a dash.
  # Blank (nil, empty or whitespace-only) cells yield "".
  def figure_key(value)
    text = value.to_s.strip
    return '' if text.empty?

    text.sub(/\D*/, 'fig_').sub(/\./, '-')
  end

  # Returns the cell value when it is text containing a period (the check the
  # image log uses to recognise a URL); anything else becomes "".
  def source_link(value)
    value.is_a?(String) && value.include?('.') ? value : ''
  end
end
@@ -0,0 +1,43 @@
1
+ require "#{File.dirname(__FILE__)}/./excel_processor"
2
+ require 'mechanize'
3
+ require 'open-uri'
4
+ require 'logger'
5
+
6
# Downloads the images listed in an image log (figure name => page URL)
# into an "images" folder located beside the spreadsheet.
class ImageScraper
  attr_accessor :log, :location

  # @param log [Hash{String => String}] figure name => source page URL
  # @param location [String] path of the spreadsheet; the images folder is
  #   created in the same directory
  def initialize(log, location)
    @log = log
    @location = location
  end

  # Creates the images folder unless it is already there.
  def create_dir
    # Dir.exists? was removed in Ruby 3.2; Dir.exist? works on every version.
    if Dir.exist?(images_dir)
      puts "Exists!"
    else
      Dir.mkdir(images_dir)
    end
  end

  # Visits every link in the log, follows the page's "Original file" link
  # (Wikimedia pages only) and saves the image as <figure><extension>.
  # A failure on one figure is reported and the remaining figures are still
  # processed. The images folder must already exist (see #create_dir) because
  # the scrape log is written into it.
  #
  # @return [Hash] the log that was processed
  def scrape
    site_seeker = Mechanize.new { |a| a.log = Logger.new("#{images_dir}/scrape_summary.log") }
    # For Wiki only
    @log.each do |fig, link|
      begin
        # Keep the original 3-4 character extension of the link, if any.
        ext = link.to_s[/\..{3,4}$/]
        page = site_seeker.get(link)
        puts "Found #{fig} at #{link}"
        original = page.links_with(:text => "Original file").first
        raise "no \"Original file\" link found at #{link}" unless original

        img_page = original.click
        puts "Located original image"
        img_page.save!("#{images_dir}/#{fig}#{ext}")
        puts "#{fig} has been saved."
      rescue StandardError => e
        puts "Uh-oh, something went wrong with #{fig.upcase}"
        # Say why, so a failed figure can be diagnosed from the console.
        puts "  #{e.class}: #{e.message}"
      end
    end
  end

  private

  # Folder beside the spreadsheet that receives the images and the scrape log.
  def images_dir
    "#{File.dirname(@location)}/images"
  end
end
43
+
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative './image_scraper.rb'
4
+ require_relative './excel_processor.rb'
5
+
6
+ #TO RUN MANUALLY ON THE COMMAND LINE
7
+
8
+ # excel_path = ARGV[0]
9
+
10
+ # @book = ExcelProcessor.new(excel_path)
11
+ # @book.read_spreadsheet
12
+ # @book.image_log
13
+
14
+ # @image_scraper = ImageScraper.new(@book.image_log, @book.spreadsheet)
15
+ # @image_scraper.create_dir
16
+ # @image_scraper.scrape
@@ -0,0 +1,56 @@
1
+ require 'spec_helper.rb'
2
+ require "#{File.dirname(__FILE__)}/../lib/excel_processor"
3
+
4
# Specs for ExcelProcessor, run against the bundled image-log template.
# The #read_spreadsheet examples call the real implementation, so they need
# Excel and WIN32OLE to be available.
#
# Uses `expect(...).to be true/false` because the `be_true`/`be_false`
# matchers were removed in RSpec 3.
describe ExcelProcessor do

  before do
    @book = ExcelProcessor.new("#{File.dirname(__FILE__)}/../resources/image-log-template.xlsx")
  end

  context "#new" do
    it "should have the spreadsheet point to a file that exists" do
      expect(File.exist?(@book.spreadsheet)).to be true
    end

    it "should create an empty hash for image log" do
      expect(@book.image_log).to be_a_kind_of(Hash)
      expect(@book.image_log.length).to eq 0
    end
  end

  context "#read_spreadsheet" do
    # Every example below inspects the log produced by reading the template.
    before do
      @book.read_spreadsheet
    end

    it "should open Excel and populate image log" do
      expect(@book.image_log).not_to be_empty
    end

    it "should start with 'fig_'" do
      expect(@book.image_log.has_key?("fig_8-8")).to be true
      expect(@book.image_log.has_key?("fig_2")).to be true
    end

    it "should replace periods between numbers with dashes" do
      expect(@book.image_log.has_key?("fig_1-1")).to be true
    end

    it "should match the right figure numbers with the right links" do
      expect(@book.image_log.fetch("fig_1-1")).to eq "http://commons.wikimedia.org/wiki/File:Basketry-covered_lightbulb_01.jpg"
    end

    it "should remove any pairs with blank values" do
      expect(@book.image_log.has_key?("")).to be false
    end

    it "should return an @image_log length of 4" do
      expect(@book.image_log.length).to eq 4
    end

  end

end
@@ -0,0 +1,49 @@
1
+ require 'spec_helper.rb'
2
+ require "#{File.dirname(__FILE__)}/../lib/image_scraper"
3
+ require "#{File.dirname(__FILE__)}/../lib/excel_processor"
4
+ require "mechanize"
5
+
6
# Specs for ImageScraper, fed by the image log read from the bundled template.
# They call the real ExcelProcessor and ImageScraper, so they need Excel,
# WIN32OLE and whatever access the template's links require.
#
# Uses `expect(...).to be true` because the `be_true` matcher was removed in
# RSpec 3.
describe ImageScraper do

  before :all do
    @book = ExcelProcessor.new("#{File.dirname(__FILE__)}/../resources/image-log-template.xlsx")
    @book.read_spreadsheet
    @image_scraper = ImageScraper.new(@book.image_log, @book.spreadsheet)
  end

  context "#new" do
    it "should receive @image_log from the Excel Processor object" do
      expect(@image_scraper.log).to be_a_kind_of(Hash)
      expect(@image_scraper.log).not_to be_empty
    end

    it "should receive the location of the spreadsheet" do
      expect(@image_scraper.location).to eq "#{File.dirname(__FILE__)}/../resources/image-log-template.xlsx"
    end
  end

  context "#create_dir" do
    it "should create a folder called 'images'" do
      @image_scraper.create_dir
      # Check the images folder itself, not the directory that contains it.
      img_dir = "#{File.dirname(@image_scraper.location)}/images"
      expect(File.directory?(img_dir)).to be true
    end
  end

  context "#scrape" do
    # Scrape once for the whole context and keep any error it raised, so the
    # examples below do not depend on the order in which they run.
    before :all do
      @image_scraper.create_dir
      begin
        @image_scraper.scrape
      rescue StandardError => e
        @scrape_error = e
      end
    end

    it "should handle all errors" do
      expect(@scrape_error).to be_nil
    end

    it "should save the image file" do
      expect(File.exist?("#{File.dirname(@image_scraper.location)}/images/fig_1-1.jpg")).to be true
    end

    it "should create a scraping summary log file" do
      expect(File.exist?("#{File.dirname(@image_scraper.location)}/images/scrape_summary.log")).to be true
    end
  end

end
@@ -0,0 +1,3 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+
3
+ require 'log2layout.rb'
metadata ADDED
@@ -0,0 +1,53 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: log2layout
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Sarah W.
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-02-15 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Pulls Wiki URLs from Excel spreadsheet and automates image downloading.
15
+ email: sarahcwheeler@smail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - README.md
21
+ - Gemfile
22
+ - spec/excel_processor_spec.rb
23
+ - spec/image_scraper_spec.rb
24
+ - spec/spec_helper.rb
25
+ - lib/excel_processor.rb
26
+ - lib/image_scraper.rb
27
+ - lib/log2layout.rb
28
+ homepage: http://rubygems.org/gems/log2layout
29
+ licenses:
30
+ - MIT
31
+ post_install_message:
32
+ rdoc_options: []
33
+ require_paths:
34
+ - lib
35
+ required_ruby_version: !ruby/object:Gem::Requirement
36
+ none: false
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ required_rubygems_version: !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - ! '>='
45
+ - !ruby/object:Gem::Version
46
+ version: '0'
47
+ requirements: []
48
+ rubyforge_project:
49
+ rubygems_version: 1.8.28
50
+ signing_key:
51
+ specification_version: 3
52
+ summary: Scraper for Wikimedia images.
53
+ test_files: []