plumnailer 0.0.4 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/plumnailer/caching_fetcher.rb +1 -1
- data/lib/plumnailer/chooser.rb +2 -2
- data/lib/plumnailer/doc.rb +12 -1
- data/lib/plumnailer/img_parser.rb +2 -2
- data/lib/plumnailer/img_url_filter.rb +23 -0
- data/lib/plumnailer.rb +1 -1
- data/plumnailer.gemspec +18 -19
- metadata +7 -7
- data/lib/plumnailer/img_hostname_filter.rb +0 -22
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0
|
1
|
+
0.1.0
|
data/lib/plumnailer/chooser.rb
CHANGED
@@ -6,7 +6,7 @@ module Plumnailer
|
|
6
6
|
def initialize
|
7
7
|
@fetcher = Plumnailer::Fetcher.new
|
8
8
|
@doc_parser = Plumnailer::DocParser.new
|
9
|
-
@img_url_filters = [Plumnailer::
|
9
|
+
@img_url_filters = [Plumnailer::ImgUrlFilter.new]
|
10
10
|
@img_parser = Plumnailer::ImgParser.new(fetcher)
|
11
11
|
@img_comparator = Plumnailer::ImgComparator
|
12
12
|
end
|
@@ -28,7 +28,7 @@ module Plumnailer
|
|
28
28
|
imgs.each do |img|
|
29
29
|
# set source document on image so it can be used in comparator
|
30
30
|
img.doc = doc
|
31
|
-
img.extend
|
31
|
+
img.extend @img_comparator
|
32
32
|
end
|
33
33
|
imgs.sort.first
|
34
34
|
end
|
data/lib/plumnailer/doc.rb
CHANGED
@@ -5,6 +5,12 @@ module Plumnailer
|
|
5
5
|
# Nokogiri::HTML::Document mixin.
|
6
6
|
module Doc
|
7
7
|
|
8
|
+
# Get the href attribute of the base tag from the head of the document.
|
9
|
+
def doc_base_href
|
10
|
+
base = at('//head/base')
|
11
|
+
base['href'] if base
|
12
|
+
end
|
13
|
+
|
8
14
|
# Return a list of the src attributes of all img tags.
|
9
15
|
def img_srcs
|
10
16
|
search('//img').map { |x| x['src'] }.compact
|
@@ -20,7 +26,12 @@ module Plumnailer
|
|
20
26
|
rescue URI::InvalidURIError
|
21
27
|
next
|
22
28
|
end
|
23
|
-
|
29
|
+
|
30
|
+
result << if u.is_a?(URI::HTTP)
|
31
|
+
u
|
32
|
+
else
|
33
|
+
URI.join(base_url || doc_base_href || source_url, i)
|
34
|
+
end
|
24
35
|
end
|
25
36
|
|
26
37
|
result
|
@@ -14,7 +14,7 @@ module Plumnailer
|
|
14
14
|
|
15
15
|
# Parse image data from one or more urls.
|
16
16
|
def parse(img_urls)
|
17
|
-
if img_urls.respond_to?
|
17
|
+
if img_urls.respond_to?(:inject)
|
18
18
|
cache = {}
|
19
19
|
img_urls.inject([]) do |memo,u|
|
20
20
|
# nil values should be cached
|
@@ -29,7 +29,7 @@ module Plumnailer
|
|
29
29
|
# additional fields.
|
30
30
|
def parse_one(img_url)
|
31
31
|
img_data = fetcher.fetch(img_url)
|
32
|
-
unless
|
32
|
+
unless not img_data or img_data.empty?
|
33
33
|
img = Magick::ImageList.new.from_blob(img_data).extend(
|
34
34
|
Plumnailer::WebImage)
|
35
35
|
img.source_url = img_url
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Plumnailer
|
2
|
+
|
3
|
+
# Decide whether to process images based on their url.
|
4
|
+
class ImgUrlFilter
|
5
|
+
|
6
|
+
# Return true if this image url should not be considered.
|
7
|
+
def reject?(img_url)
|
8
|
+
ImgUrlPatterns.each do |re|
|
9
|
+
return true if img_url and img_url.to_s[re]
|
10
|
+
end
|
11
|
+
false
|
12
|
+
end
|
13
|
+
|
14
|
+
ImgUrlPatterns = [
|
15
|
+
%r{^http://ad\.doubleclick\.net/},
|
16
|
+
%r{^http://b\.scorecardresearch\.com/},
|
17
|
+
%r{^http://pixel\.quantserve\.com/},
|
18
|
+
%r{^http://s7\.addthis\.com/},
|
19
|
+
]
|
20
|
+
|
21
|
+
end
|
22
|
+
|
23
|
+
end
|
data/lib/plumnailer.rb
CHANGED
@@ -3,8 +3,8 @@ require 'plumnailer/doc_parser'
|
|
3
3
|
require 'plumnailer/doc'
|
4
4
|
require 'plumnailer/fetcher'
|
5
5
|
require 'plumnailer/img_comparator'
|
6
|
-
require 'plumnailer/img_hostname_filter'
|
7
6
|
require 'plumnailer/img_parser'
|
7
|
+
require 'plumnailer/img_url_filter'
|
8
8
|
require 'plumnailer/web_image'
|
9
9
|
|
10
10
|
require 'plumnailer/caching_fetcher'
|
data/plumnailer.gemspec
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
# Generated by jeweler
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
-
# Instead, edit Jeweler::Tasks in Rakefile, and run
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{plumnailer}
|
8
|
-
s.version = "0.0
|
8
|
+
s.version = "0.1.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Matthew M. Boedicker"]
|
12
|
-
s.date = %q{2010-11-
|
12
|
+
s.date = %q{2010-11-21}
|
13
13
|
s.description = %q{Choose the most representative image on an HTML page for use as a thumbnail}
|
14
14
|
s.email = %q{matthewm@boedicker.org}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -17,24 +17,23 @@ Gem::Specification.new do |s|
|
|
17
17
|
]
|
18
18
|
s.files = [
|
19
19
|
"COPYING",
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
20
|
+
"README.textile",
|
21
|
+
"Rakefile",
|
22
|
+
"VERSION",
|
23
|
+
"lib/plumnailer.rb",
|
24
|
+
"lib/plumnailer/caching_fetcher.rb",
|
25
|
+
"lib/plumnailer/chooser.rb",
|
26
|
+
"lib/plumnailer/doc.rb",
|
27
|
+
"lib/plumnailer/doc_parser.rb",
|
28
|
+
"lib/plumnailer/fetcher.rb",
|
29
|
+
"lib/plumnailer/img_comparator.rb",
|
30
|
+
"lib/plumnailer/img_parser.rb",
|
31
|
+
"lib/plumnailer/img_url_filter.rb",
|
32
|
+
"lib/plumnailer/web_image.rb",
|
33
|
+
"plumnailer.gemspec",
|
34
|
+
"test.rb"
|
35
35
|
]
|
36
36
|
s.homepage = %q{http://github.com/mmb/plumnailer}
|
37
|
-
s.rdoc_options = ["--charset=UTF-8"]
|
38
37
|
s.require_paths = ["lib"]
|
39
38
|
s.rubygems_version = %q{1.3.7}
|
40
39
|
s.summary = %q{Choose the most representative image on an HTML page}
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: plumnailer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 27
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
+
- 1
|
8
9
|
- 0
|
9
|
-
|
10
|
-
version: 0.0.4
|
10
|
+
version: 0.1.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Matthew M. Boedicker
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-11-
|
18
|
+
date: 2010-11-21 00:00:00 -05:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -71,8 +71,8 @@ files:
|
|
71
71
|
- lib/plumnailer/doc_parser.rb
|
72
72
|
- lib/plumnailer/fetcher.rb
|
73
73
|
- lib/plumnailer/img_comparator.rb
|
74
|
-
- lib/plumnailer/img_hostname_filter.rb
|
75
74
|
- lib/plumnailer/img_parser.rb
|
75
|
+
- lib/plumnailer/img_url_filter.rb
|
76
76
|
- lib/plumnailer/web_image.rb
|
77
77
|
- plumnailer.gemspec
|
78
78
|
- test.rb
|
@@ -81,8 +81,8 @@ homepage: http://github.com/mmb/plumnailer
|
|
81
81
|
licenses: []
|
82
82
|
|
83
83
|
post_install_message:
|
84
|
-
rdoc_options:
|
85
|
-
|
84
|
+
rdoc_options: []
|
85
|
+
|
86
86
|
require_paths:
|
87
87
|
- lib
|
88
88
|
required_ruby_version: !ruby/object:Gem::Requirement
|
@@ -1,22 +0,0 @@
|
|
1
|
-
module Plumnailer
|
2
|
-
|
3
|
-
# Decide whether to process images based on their url hostname.
|
4
|
-
class ImgHostnameFilter
|
5
|
-
|
6
|
-
# Return true if this image url should not be considered.
|
7
|
-
def reject?(img_url)
|
8
|
-
HostnameRejectPatterns.each do |re|
|
9
|
-
return true if img_url.host and img_url.host[re]
|
10
|
-
end
|
11
|
-
false
|
12
|
-
end
|
13
|
-
|
14
|
-
HostnameRejectPatterns = [
|
15
|
-
%r{^ad\.doubleclick\.net$},
|
16
|
-
%r{^b\.scorecardresearch\.com$},
|
17
|
-
%r{^pixel\.quantserve\.com$},
|
18
|
-
]
|
19
|
-
|
20
|
-
end
|
21
|
-
|
22
|
-
end
|