plumnailer 0.0.4 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/plumnailer/caching_fetcher.rb +1 -1
- data/lib/plumnailer/chooser.rb +2 -2
- data/lib/plumnailer/doc.rb +12 -1
- data/lib/plumnailer/img_parser.rb +2 -2
- data/lib/plumnailer/img_url_filter.rb +23 -0
- data/lib/plumnailer.rb +1 -1
- data/plumnailer.gemspec +18 -19
- metadata +7 -7
- data/lib/plumnailer/img_hostname_filter.rb +0 -22
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.4
|
1
|
+
0.1.0
|
data/lib/plumnailer/chooser.rb
CHANGED
@@ -6,7 +6,7 @@ module Plumnailer
|
|
6
6
|
def initialize
|
7
7
|
@fetcher = Plumnailer::Fetcher.new
|
8
8
|
@doc_parser = Plumnailer::DocParser.new
|
9
|
-
@img_url_filters = [Plumnailer::ImgHostnameFilter.new]
|
9
|
+
@img_url_filters = [Plumnailer::ImgUrlFilter.new]
|
10
10
|
@img_parser = Plumnailer::ImgParser.new(fetcher)
|
11
11
|
@img_comparator = Plumnailer::ImgComparator
|
12
12
|
end
|
@@ -28,7 +28,7 @@ module Plumnailer
|
|
28
28
|
imgs.each do |img|
|
29
29
|
# set source document on image so it can be used in comparator
|
30
30
|
img.doc = doc
|
31
|
-
img.extend
|
31
|
+
img.extend @img_comparator
|
32
32
|
end
|
33
33
|
imgs.sort.first
|
34
34
|
end
|
data/lib/plumnailer/doc.rb
CHANGED
@@ -5,6 +5,12 @@ module Plumnailer
|
|
5
5
|
# Nokogiri::HTML::Document mixin.
|
6
6
|
module Doc
|
7
7
|
|
8
|
+
# Get the href attribute of the base tag from the head of the document.
|
9
|
+
def doc_base_href
|
10
|
+
base = at('//head/base')
|
11
|
+
base['href'] if base
|
12
|
+
end
|
13
|
+
|
8
14
|
# Return a list of the src attributes of all img tags.
|
9
15
|
def img_srcs
|
10
16
|
search('//img').map { |x| x['src'] }.compact
|
@@ -20,7 +26,12 @@ module Plumnailer
|
|
20
26
|
rescue URI::InvalidURIError
|
21
27
|
next
|
22
28
|
end
|
23
|
-
|
29
|
+
|
30
|
+
result << if u.is_a?(URI::HTTP)
|
31
|
+
u
|
32
|
+
else
|
33
|
+
URI.join(base_url || doc_base_href || source_url, i)
|
34
|
+
end
|
24
35
|
end
|
25
36
|
|
26
37
|
result
|
@@ -14,7 +14,7 @@ module Plumnailer
|
|
14
14
|
|
15
15
|
# Parse image data from one or more urls.
|
16
16
|
def parse(img_urls)
|
17
|
-
if img_urls.respond_to?
|
17
|
+
if img_urls.respond_to?(:inject)
|
18
18
|
cache = {}
|
19
19
|
img_urls.inject([]) do |memo,u|
|
20
20
|
# nil values should be cached
|
@@ -29,7 +29,7 @@ module Plumnailer
|
|
29
29
|
# additional fields.
|
30
30
|
def parse_one(img_url)
|
31
31
|
img_data = fetcher.fetch(img_url)
|
32
|
-
unless
|
32
|
+
unless not img_data or img_data.empty?
|
33
33
|
img = Magick::ImageList.new.from_blob(img_data).extend(
|
34
34
|
Plumnailer::WebImage)
|
35
35
|
img.source_url = img_url
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Plumnailer
|
2
|
+
|
3
|
+
# Decide whether to process images based on their url.
|
4
|
+
class ImgUrlFilter
|
5
|
+
|
6
|
+
# Return true if this image url should not be considered.
|
7
|
+
def reject?(img_url)
|
8
|
+
ImgUrlPatterns.each do |re|
|
9
|
+
return true if img_url and img_url.to_s[re]
|
10
|
+
end
|
11
|
+
false
|
12
|
+
end
|
13
|
+
|
14
|
+
ImgUrlPatterns = [
|
15
|
+
%r{^http://ad\.doubleclick\.net/},
|
16
|
+
%r{^http://b\.scorecardresearch\.com/},
|
17
|
+
%r{^http://pixel\.quantserve\.com/},
|
18
|
+
%r{^http://s7\.addthis\.com/},
|
19
|
+
]
|
20
|
+
|
21
|
+
end
|
22
|
+
|
23
|
+
end
|
data/lib/plumnailer.rb
CHANGED
@@ -3,8 +3,8 @@ require 'plumnailer/doc_parser'
|
|
3
3
|
require 'plumnailer/doc'
|
4
4
|
require 'plumnailer/fetcher'
|
5
5
|
require 'plumnailer/img_comparator'
|
6
|
-
require 'plumnailer/img_hostname_filter'
|
7
6
|
require 'plumnailer/img_parser'
|
7
|
+
require 'plumnailer/img_url_filter'
|
8
8
|
require 'plumnailer/web_image'
|
9
9
|
|
10
10
|
require 'plumnailer/caching_fetcher'
|
data/plumnailer.gemspec
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
# Generated by jeweler
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
-
# Instead, edit Jeweler::Tasks in Rakefile, and run
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{plumnailer}
|
8
|
-
s.version = "0.0.4"
|
8
|
+
s.version = "0.1.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Matthew M. Boedicker"]
|
12
|
-
s.date = %q{2010-11-
|
12
|
+
s.date = %q{2010-11-21}
|
13
13
|
s.description = %q{Choose the most representative image on an HTML page for use as a thumbnail}
|
14
14
|
s.email = %q{matthewm@boedicker.org}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -17,24 +17,23 @@ Gem::Specification.new do |s|
|
|
17
17
|
]
|
18
18
|
s.files = [
|
19
19
|
"COPYING",
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
20
|
+
"README.textile",
|
21
|
+
"Rakefile",
|
22
|
+
"VERSION",
|
23
|
+
"lib/plumnailer.rb",
|
24
|
+
"lib/plumnailer/caching_fetcher.rb",
|
25
|
+
"lib/plumnailer/chooser.rb",
|
26
|
+
"lib/plumnailer/doc.rb",
|
27
|
+
"lib/plumnailer/doc_parser.rb",
|
28
|
+
"lib/plumnailer/fetcher.rb",
|
29
|
+
"lib/plumnailer/img_comparator.rb",
|
30
|
+
"lib/plumnailer/img_parser.rb",
|
31
|
+
"lib/plumnailer/img_url_filter.rb",
|
32
|
+
"lib/plumnailer/web_image.rb",
|
33
|
+
"plumnailer.gemspec",
|
34
|
+
"test.rb"
|
35
35
|
]
|
36
36
|
s.homepage = %q{http://github.com/mmb/plumnailer}
|
37
|
-
s.rdoc_options = ["--charset=UTF-8"]
|
38
37
|
s.require_paths = ["lib"]
|
39
38
|
s.rubygems_version = %q{1.3.7}
|
40
39
|
s.summary = %q{Choose the most representative image on an HTML page}
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: plumnailer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 27
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
+
- 1
|
8
9
|
- 0
|
9
|
-
|
10
|
-
version: 0.0.4
|
10
|
+
version: 0.1.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Matthew M. Boedicker
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-11-
|
18
|
+
date: 2010-11-21 00:00:00 -05:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -71,8 +71,8 @@ files:
|
|
71
71
|
- lib/plumnailer/doc_parser.rb
|
72
72
|
- lib/plumnailer/fetcher.rb
|
73
73
|
- lib/plumnailer/img_comparator.rb
|
74
|
-
- lib/plumnailer/img_hostname_filter.rb
|
75
74
|
- lib/plumnailer/img_parser.rb
|
75
|
+
- lib/plumnailer/img_url_filter.rb
|
76
76
|
- lib/plumnailer/web_image.rb
|
77
77
|
- plumnailer.gemspec
|
78
78
|
- test.rb
|
@@ -81,8 +81,8 @@ homepage: http://github.com/mmb/plumnailer
|
|
81
81
|
licenses: []
|
82
82
|
|
83
83
|
post_install_message:
|
84
|
-
rdoc_options:
|
85
|
-
|
84
|
+
rdoc_options: []
|
85
|
+
|
86
86
|
require_paths:
|
87
87
|
- lib
|
88
88
|
required_ruby_version: !ruby/object:Gem::Requirement
|
@@ -1,22 +0,0 @@
|
|
1
|
-
module Plumnailer
|
2
|
-
|
3
|
-
# Decide whether to process images based on their url hostname.
|
4
|
-
class ImgHostnameFilter
|
5
|
-
|
6
|
-
# Return true if this image url should not be considered.
|
7
|
-
def reject?(img_url)
|
8
|
-
HostnameRejectPatterns.each do |re|
|
9
|
-
return true if img_url.host and img_url.host[re]
|
10
|
-
end
|
11
|
-
false
|
12
|
-
end
|
13
|
-
|
14
|
-
HostnameRejectPatterns = [
|
15
|
-
%r{^ad\.doubleclick\.net$},
|
16
|
-
%r{^b\.scorecardresearch\.com$},
|
17
|
-
%r{^pixel\.quantserve\.com$},
|
18
|
-
]
|
19
|
-
|
20
|
-
end
|
21
|
-
|
22
|
-
end
|