plumnailer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/COPYING ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License
2
+
3
+ Copyright (c) 2010 Matthew M. Boedicker
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.textile ADDED
@@ -0,0 +1,23 @@
1
+ Compare all of the images on an HTML page and select the one that best
2
+ represents what the page is about (for use as a thumbnail for the entire page).
3
+
4
+ Project is in an early stage and for now just picks the largest image on the
5
+ page. More sophisticated scoring coming soon.
6
+
7
+ <pre>
8
+ <code>
9
+ require 'plumnailer'
10
+
11
+ chooser = Plumnailer::Chooser.new
12
+ # caching fetcher is good for testing
13
+ chooser.fetcher = chooser.img_parser.fetcher =
14
+ Plumnailer::CachingFetcher.new('/tmp/plumnailer-test')
15
+
16
+ choice = chooser.choose('http://urls.matthewm.boedicker.org/')
17
+ puts choice.source_url
18
+
19
+ choice.resize_to_fill(64, 64).write('thumb.jpg')
20
+ </code>
21
+ </pre>
22
+
23
+ Questions and comments: "matthewm@boedicker.org":mailto:matthewm@boedicker.org
data/Rakefile ADDED
@@ -0,0 +1,18 @@
1
# Define the gem packaging tasks when jeweler is installed; otherwise print a
# hint on how to install it.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |gemspec|
    gemspec.name = 'plumnailer'
    gemspec.summary = 'Choose the most representative image on an HTML page'
    gemspec.description = 'Choose the most representative image on an HTML page for use as a thumbnail'
    gemspec.email = 'matthewm@boedicker.org'
    gemspec.homepage = 'http://github.com/mmb/plumnailer'
    gemspec.authors = ['Matthew M. Boedicker']

    # gem name paired with the minimum version required
    [
      ['nokogiri', '1.4.3.1'],
      ['rmagick', '2.13.1'],
    ].each do |gem_name, min_version|
      gemspec.add_dependency(gem_name, ">= #{min_version}")
    end
  end
rescue LoadError
  puts "Jeweler not available. Install it with: gem install jeweler"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.1
@@ -0,0 +1,33 @@
1
+ require 'cgi'
2
+ require 'fileutils'
3
+
4
+ require 'plumnailer'
5
+
6
module Plumnailer

  # Fetch the contents of a url and cache result on filesystem.
  #
  # Each url is stored in its own file inside cache_dir, named after the
  # CGI-escaped url.
  class CachingFetcher < Fetcher

    attr_accessor :cache_dir

    # cache_dir:: directory holding the cached bodies, created if missing.
    def initialize(cache_dir)
      @cache_dir = cache_dir
      FileUtils.mkdir_p(cache_dir)
    end

    # Fetch the contents of a url and cache result on filesystem.
    #
    # Returns the cached copy when one exists, otherwise whatever the parent
    # fetcher returns (nil when the parent declines to fetch the url).
    def fetch(url)
      cache_file = File.join(cache_dir, CGI.escape(url.to_s))

      # File.exist? is used because the File.exists? alias was removed in
      # Ruby 3.2. Binary mode keeps image bytes intact on every platform.
      if File.exist?(cache_file)
        return File.open(cache_file, 'rb') { |f| f.read }
      end

      data = super
      # Do not cache a nil result: an empty cache file would make the next
      # call return "" instead of nil for the same url.
      File.open(cache_file, 'wb') { |f| f.write(data) } unless data.nil?
      data
    end

  end

end
@@ -0,0 +1,45 @@
1
module Plumnailer

  # Find the most representative image on a page.
  #
  # The collaborators (fetcher, parsers, url filters and comparator mixin)
  # are exposed as accessors so callers can swap them.
  class Chooser

    attr_accessor :fetcher
    attr_accessor :doc_parser
    attr_accessor :img_url_filters
    attr_accessor :img_parser
    attr_accessor :img_comparator

    def initialize
      @fetcher = Plumnailer::Fetcher.new
      @doc_parser = Plumnailer::DocParser.new
      @img_url_filters = [Plumnailer::ImgHostnameFilter.new]
      @img_parser = Plumnailer::ImgParser.new(fetcher)
      @img_comparator = Plumnailer::ImgComparator
    end

    # Find the most representative image on a page.
    #
    # url:: address of the HTML page to inspect.
    #
    # Returns the image that orders first under the comparator mixin, or nil
    # when no image on the page could be parsed.
    def choose(url)
      doc = doc_parser.parse(fetcher.fetch(url), url)

      # reject builds a new list, so the document's own url list is untouched
      candidate_urls = doc.img_abs_urls.reject do |img_url|
        img_url_filters.any? { |filter| filter.reject?(img_url) }
      end

      imgs = img_parser.parse(candidate_urls)
      return nil if imgs.empty?

      imgs.each do |img|
        # set source document on image so it can be used in comparator
        img.doc = doc
        img.extend(img_comparator)
      end

      # min picks the element a full sort would put first, without sorting
      imgs.min
    end

  end

end
@@ -0,0 +1,32 @@
1
+ require 'uri'
2
+
3
module Plumnailer

  # Nokogiri::HTML::Document mixin.
  #
  # Relies on the host object providing +search+.
  module Doc

    attr_accessor :source_url

    # Return a list of the src attributes of all img tags.
    def img_srcs
      search('//img').map { |x| x['src'] }.compact
    end

    # Return a list of the absolute urls of all imgs in the document.
    #
    # base_url:: url that relative srcs are resolved against; defaults to
    #            source_url.
    #
    # Srcs that are blank, cannot be parsed, or are relative while no usable
    # base is available are left out instead of raising.
    def img_abs_urls(base_url=nil)
      base = base_url || source_url
      result = []

      img_srcs.each do |src|
        candidate = src.strip
        # an empty src would resolve to the page itself
        next if candidate.empty?

        begin
          u = URI(candidate)
          unless u.is_a?(URI::HTTP)
            # nothing to resolve a relative src against
            next if base.nil?
            u = URI.join(base, candidate)
          end
          result.push(u)
        rescue URI::Error
          # covers both unparsable srcs and a base that cannot be joined
          next
        end
      end

      result
    end

  end

end
@@ -0,0 +1,19 @@
1
+ require 'nokogiri'
2
+
3
+ require 'plumnailer'
4
+
5
module Plumnailer

  # Turns an HTML string into a document object carrying the Doc mixin.
  class DocParser

    # Parse doc_string as HTML, mix Plumnailer::Doc into the resulting
    # document and record source_url on it. Returns the document.
    def parse(doc_string, source_url=nil)
      Nokogiri::HTML(doc_string).tap do |parsed|
        parsed.extend(Plumnailer::Doc)
        parsed.source_url = source_url
      end
    end

  end

end
@@ -0,0 +1,16 @@
1
+ require 'open-uri'
2
+
3
module Plumnailer

  # Fetch the contents of a url.
  class Fetcher

    # Fetch the contents of a url.
    #
    # url:: a URI object or a string that URI() can parse.
    #
    # Returns the response body for http/https urls and nil for any other
    # kind of url. Errors raised while parsing or opening the url propagate.
    def fetch(url)
      uri = url.is_a?(URI) ? url : URI(url)
      return nil unless uri.is_a?(URI::HTTP)

      # Call open on the URI itself: Kernel#open stopped accepting URIs in
      # Ruby 3.0, while the method open-uri adds to URI works on old and new
      # rubies alike.
      uri.open { |f| f.read }
    end

  end

end
@@ -0,0 +1,13 @@
1
module Plumnailer

  # Mixin for Magick::Image that adds a comparator for sorting by relevance.
  module ImgComparator

    # Order images by pixel area (rows * columns), largest first, so the
    # largest image is the first element after sorting.
    #
    # Returns -1, 0 or 1, or nil when other does not expose rows and columns
    # (the usual <=> answer for operands that cannot be compared).
    def <=>(other)
      return nil unless other.respond_to?(:rows) && other.respond_to?(:columns)

      # can use doc here to take source page into account
      other.rows * other.columns <=> rows * columns
    end

  end

end
@@ -0,0 +1,21 @@
1
module Plumnailer

  # Decide whether to process images based on their url hostname.
  class ImgHostnameFilter

    # Hostnames whose images are never considered. Patterns are anchored to
    # the whole string and ignore case, because hostnames are
    # case-insensitive.
    HostnameRejectPatterns = [
      /\Aad\.doubleclick\.net\z/i,
      /\Ab\.scorecardresearch\.com\z/i,
    ]

    # Return true if this image url should not be considered.
    #
    # img_url:: object responding to +host+ (for example a URI).
    #
    # Returns false when the url has no host.
    def reject?(img_url)
      host = img_url.host
      return false unless host

      HostnameRejectPatterns.any? { |re| host[re] }
    end

  end

end
@@ -0,0 +1,39 @@
1
+ require 'RMagick'
2
+
3
+ require 'plumnailer'
4
+
5
module Plumnailer

  # Fetch image data from urls, load into Magick::Image and set some
  # additional fields.
  class ImgParser

    attr_accessor :fetcher

    # fetcher:: object whose +fetch(url)+ returns the raw image data.
    def initialize(fetcher)
      @fetcher = fetcher
    end

    # Parse image data from one or more urls.
    #
    # Given a collection, returns an array with the urls that produced no
    # image left out. Given a single url, returns what parse_one returns.
    def parse(img_urls)
      if img_urls.respond_to? :map
        img_urls.map { |i| parse_one i }.compact
      else
        # pass the argument itself; the block variable of the other branch
        # does not exist here
        parse_one img_urls
      end
    end

    # Fetch image data from a url, load into Magick::Image and set some
    # additional fields.
    #
    # Returns the first image decoded from the data, extended with
    # Plumnailer::WebImage and tagged with its source url, or nil when the
    # fetcher returned nothing.
    def parse_one(img_url)
      img_data = fetcher.fetch(img_url)
      return nil if !img_data || img_data.empty?

      img = Magick::ImageList.new.from_blob(img_data).first.extend(
        Plumnailer::WebImage)
      img.source_url = img_url
      img
    end

  end

end
@@ -0,0 +1,9 @@
1
module Plumnailer

  # Mixin for Magick::Image giving an image two extra read/write fields: the
  # url it was loaded from and the document it appeared in.
  module WebImage
    attr_accessor :source_url, :doc
  end

end
data/lib/plumnailer.rb ADDED
@@ -0,0 +1,10 @@
1
+ require 'plumnailer/chooser'
2
+ require 'plumnailer/doc_parser'
3
+ require 'plumnailer/doc'
4
+ require 'plumnailer/fetcher'
5
+ require 'plumnailer/img_comparator'
6
+ require 'plumnailer/img_hostname_filter'
7
+ require 'plumnailer/img_parser'
8
+ require 'plumnailer/web_image'
9
+
10
+ require 'plumnailer/caching_fetcher'
@@ -0,0 +1,58 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{plumnailer}
8
+ s.version = "0.0.1"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Matthew M. Boedicker"]
12
+ s.date = %q{2010-11-09}
13
+ s.description = %q{Choose the most representative image on an HTML page for use as a thumbnail}
14
+ s.email = %q{matthewm@boedicker.org}
15
+ s.extra_rdoc_files = [
16
+ "README.textile"
17
+ ]
18
+ s.files = [
19
+ "COPYING",
20
+ "README.textile",
21
+ "Rakefile",
22
+ "VERSION",
23
+ "lib/plumnailer.rb",
24
+ "lib/plumnailer/caching_fetcher.rb",
25
+ "lib/plumnailer/chooser.rb",
26
+ "lib/plumnailer/doc.rb",
27
+ "lib/plumnailer/doc_parser.rb",
28
+ "lib/plumnailer/fetcher.rb",
29
+ "lib/plumnailer/img_comparator.rb",
30
+ "lib/plumnailer/img_hostname_filter.rb",
31
+ "lib/plumnailer/img_parser.rb",
32
+ "lib/plumnailer/web_image.rb",
33
+ "plumnailer.gemspec",
34
+ "test.rb"
35
+ ]
36
+ s.homepage = %q{http://github.com/mmb/plumnailer}
37
+ s.rdoc_options = ["--charset=UTF-8"]
38
+ s.require_paths = ["lib"]
39
+ s.rubygems_version = %q{1.3.7}
40
+ s.summary = %q{Choose the most representative image on an HTML page}
41
+
42
+ if s.respond_to? :specification_version then
43
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
44
+ s.specification_version = 3
45
+
46
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
47
+ s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.3.1"])
48
+ s.add_runtime_dependency(%q<rmagick>, [">= 2.13.1"])
49
+ else
50
+ s.add_dependency(%q<nokogiri>, [">= 1.4.3.1"])
51
+ s.add_dependency(%q<rmagick>, [">= 2.13.1"])
52
+ end
53
+ else
54
+ s.add_dependency(%q<nokogiri>, [">= 1.4.3.1"])
55
+ s.add_dependency(%q<rmagick>, [">= 2.13.1"])
56
+ end
57
+ end
58
+
data/test.rb ADDED
@@ -0,0 +1,36 @@
1
$:.unshift(File.join(File.dirname(__FILE__), 'lib'))

require 'cgi'

require 'sinatra'

require 'plumnailer'

# sinatra web application for testing
#
# GET / shows a form asking for a page url. When the url parameter is present
# the chosen image for that page is shown below the form.

get '/' do
  page_link = img = ''
  url = params[:url]

  # an empty form submission sends url as "", which is not worth fetching
  if url && !url.empty?
    chooser = Plumnailer::Chooser.new
    chooser.fetcher = chooser.img_parser.fetcher =
      Plumnailer::CachingFetcher.new('/tmp/plumnailer-test')
    choice = chooser.choose(url)
    if choice
      # Both values come from outside (request parameter, fetched page), so
      # they are HTML-escaped before being placed into the markup.
      safe_url = CGI.escapeHTML(url)
      safe_img_url = CGI.escapeHTML(choice.source_url.to_s)
      page_link = "<p><a href=\"#{safe_url}\">#{safe_url}</a></p>"
      img = "<p><img src=\"#{safe_img_url}\" /></p>"
    else
      img = '<p>No images found</p>'
    end
  end

  <<-eos
<form method="get" action="">
<input type="text" name="url" size="50" />
<input type="submit" />
</form>

#{page_link}

#{img}
  eos
end
metadata ADDED
@@ -0,0 +1,114 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: plumnailer
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Matthew M. Boedicker
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-11-09 00:00:00 -05:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: nokogiri
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 113
30
+ segments:
31
+ - 1
32
+ - 4
33
+ - 3
34
+ - 1
35
+ version: 1.4.3.1
36
+ type: :runtime
37
+ version_requirements: *id001
38
+ - !ruby/object:Gem::Dependency
39
+ name: rmagick
40
+ prerelease: false
41
+ requirement: &id002 !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ hash: 57
47
+ segments:
48
+ - 2
49
+ - 13
50
+ - 1
51
+ version: 2.13.1
52
+ type: :runtime
53
+ version_requirements: *id002
54
+ description: Choose the most representative image on an HTML page for use as a thumbnail
55
+ email: matthewm@boedicker.org
56
+ executables: []
57
+
58
+ extensions: []
59
+
60
+ extra_rdoc_files:
61
+ - README.textile
62
+ files:
63
+ - COPYING
64
+ - README.textile
65
+ - Rakefile
66
+ - VERSION
67
+ - lib/plumnailer.rb
68
+ - lib/plumnailer/caching_fetcher.rb
69
+ - lib/plumnailer/chooser.rb
70
+ - lib/plumnailer/doc.rb
71
+ - lib/plumnailer/doc_parser.rb
72
+ - lib/plumnailer/fetcher.rb
73
+ - lib/plumnailer/img_comparator.rb
74
+ - lib/plumnailer/img_hostname_filter.rb
75
+ - lib/plumnailer/img_parser.rb
76
+ - lib/plumnailer/web_image.rb
77
+ - plumnailer.gemspec
78
+ - test.rb
79
+ has_rdoc: true
80
+ homepage: http://github.com/mmb/plumnailer
81
+ licenses: []
82
+
83
+ post_install_message:
84
+ rdoc_options:
85
+ - --charset=UTF-8
86
+ require_paths:
87
+ - lib
88
+ required_ruby_version: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ">="
92
+ - !ruby/object:Gem::Version
93
+ hash: 3
94
+ segments:
95
+ - 0
96
+ version: "0"
97
+ required_rubygems_version: !ruby/object:Gem::Requirement
98
+ none: false
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ hash: 3
103
+ segments:
104
+ - 0
105
+ version: "0"
106
+ requirements: []
107
+
108
+ rubyforge_project:
109
+ rubygems_version: 1.3.7
110
+ signing_key:
111
+ specification_version: 3
112
+ summary: Choose the most representative image on an HTML page
113
+ test_files: []
114
+