pismo 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +32 -0
- data/README.rdoc +68 -0
- data/Rakefile +95 -0
- data/VERSION +1 -0
- data/bin/pismo +36 -0
- data/lib/pismo/document.rb +50 -0
- data/lib/pismo/external_attributes.rb +14 -0
- data/lib/pismo/internal_attributes.rb +202 -0
- data/lib/pismo/readability.rb +316 -0
- data/lib/pismo/stopwords.txt +893 -0
- data/lib/pismo.rb +44 -0
- data/pismo.gemspec +92 -0
- data/test/corpus/bbcnews.html +2131 -0
- data/test/corpus/briancray.html +269 -0
- data/test/corpus/cant_read.html +426 -0
- data/test/corpus/factor.html +1362 -0
- data/test/corpus/huffington.html +2932 -0
- data/test/corpus/metadata_expected.yaml +81 -0
- data/test/corpus/rubyinside.html +318 -0
- data/test/corpus/rww.html +1351 -0
- data/test/corpus/spolsky.html +298 -0
- data/test/corpus/techcrunch.html +1285 -0
- data/test/corpus/youtube.html +2348 -0
- data/test/helper.rb +15 -0
- data/test/test_corpus.rb +33 -0
- data/test/test_pismo_document.rb +34 -0
- data/test/test_readability.rb +152 -0
- metadata +146 -0
data/lib/pismo.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'fast_stemmer'
|
4
|
+
require 'chronic'
|
5
|
+
|
6
|
+
$: << File.dirname(__FILE__)
|
7
|
+
require 'pismo/document'
|
8
|
+
require 'pismo/readability'
|
9
|
+
|
10
|
+
module Pismo
  # Sugar method to make creating document objects nicer.
  #
  # Simply forwards both arguments to Document.new and returns the result.
  #
  # handle - passed through unchanged as the first argument of Document.new
  # url    - optional second argument for Document.new (defaults to nil)
  def self.document(handle, url = nil)
    Document.new(handle, url)
  end

  # Helper functions that operate on lists of nodes.
  class NFunctions
    # Returns every element of +list+ whose 'href' entry matches +expression+.
    #
    # list       - an enumerable of objects that respond to ['href']
    # expression - a String or Regexp; it is interpolated into a regular
    #              expression, so a String is treated as regex source and is
    #              NOT escaped
    #
    # Elements whose 'href' is nil never match. The result is an Array.
    def self.match_href(list, expression)
      # Build the pattern once instead of re-interpolating it for every node.
      pattern = /#{expression}/
      list.find_all { |node| node['href'] =~ pattern }
    end
  end
end
|
22
|
+
|
23
|
+
# Add some sugar to Nokogiri
|
24
|
+
class Nokogiri::HTML::Document
  # Returns the first node found by +search+, or nil when nothing is found
  # or when the lookup raises a StandardError.
  def get_the(search)
    self.search(search).first
  rescue StandardError
    nil
  end

  # Tries each query in order and returns the text from the first one that
  # yields a result, or nil when none do.
  #
  # A query is either:
  # * a String  - used as a search expression; the stripped inner text of the
  #               first matching node is the result
  # * an Array  - [search expression, callable]; the callable receives the
  #               first matching node and its stripped return value is the
  #               result
  #
  # Any StandardError raised while evaluating a query makes that query count
  # as "no result", and the next query is tried. Queries of any other type
  # are skipped.
  def match(*queries)
    queries.each do |query|
      text =
        if query.is_a?(String)
          begin
            self.search(query).first.inner_text.strip
          rescue StandardError
            nil
          end
        elsif query.is_a?(Array)
          begin
            query[1].call(self.search(query.first).first).strip
          rescue StandardError
            nil
          end
        end

      next unless text

      # The octal escapes are the UTF-8 byte sequences E2 80 99 and E2 80 94;
      # they are replaced with a plain apostrophe and a hyphen respectively.
      text.gsub!(/\342\200\231/, '\'')
      text.gsub!(/\342\200\224/, '-')
      return text
    end

    nil
  end
end
|
data/pismo.gemspec
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
# Gem specification for pismo 0.2.0.
#
# NOTE(review): this file is generated by jeweler, so a manual change here is
# overwritten on the next regeneration; the durable fix belongs in the
# generator / Rakefile. The hand edits below only keep the file loadable:
# * capability checks use respond_to? instead of comparing against the legacy
#   Gem::RubyGemsVersion constant (deprecated in favour of Gem::VERSION and,
#   as far as I can tell, absent from recent RubyGems -- TODO confirm)
# * the legacy default_executable attribute is only set when supported
Gem::Specification.new do |s|
  s.name = %q{pismo}
  s.version = "0.2.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Peter Cooper"]
  s.date = %q{2010-03-26}
  # Legacy attribute; skipped on RubyGems versions that do not provide it.
  s.default_executable = %q{pismo} if s.respond_to? :default_executable=
  s.description = %q{Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, del.icio.us tags, first image used in the content block, etc.}
  s.email = %q{git@peterc.org}
  s.executables = ["pismo"]
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "bin/pismo",
    "lib/pismo.rb",
    "lib/pismo/document.rb",
    "lib/pismo/external_attributes.rb",
    "lib/pismo/internal_attributes.rb",
    "lib/pismo/readability.rb",
    "lib/pismo/stopwords.txt",
    "pismo.gemspec",
    "test/corpus/bbcnews.html",
    "test/corpus/briancray.html",
    "test/corpus/cant_read.html",
    "test/corpus/factor.html",
    "test/corpus/huffington.html",
    "test/corpus/metadata_expected.yaml",
    "test/corpus/rubyinside.html",
    "test/corpus/rww.html",
    "test/corpus/spolsky.html",
    "test/corpus/techcrunch.html",
    "test/corpus/youtube.html",
    "test/helper.rb",
    "test/test_corpus.rb",
    "test/test_pismo_document.rb",
    "test/test_readability.rb"
  ]
  s.homepage = %q{http://github.com/peterc/pismo}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.5}
  s.summary = %q{Extracts or retrieves content-related metadata from HTML pages}
  s.test_files = [
    "test/helper.rb",
    "test/test_corpus.rb",
    "test/test_pismo_document.rb",
    "test/test_readability.rb"
  ]

  s.specification_version = 3 if s.respond_to? :specification_version=

  if s.respond_to?(:add_runtime_dependency) && s.respond_to?(:add_development_dependency)
    # RubyGems that distinguishes runtime from development dependencies.
    s.add_development_dependency(%q<shoulda>, [">= 0"])
    s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
    s.add_runtime_dependency(%q<loofah>, [">= 0"])
    s.add_runtime_dependency(%q<httparty>, [">= 0"])
    s.add_runtime_dependency(%q<fast-stemmer>, [">= 0"])
    s.add_runtime_dependency(%q<chronic>, [">= 0"])
  else
    # Fallback for RubyGems without typed dependencies: everything is a
    # plain dependency, exactly as in the generated original.
    s.add_dependency(%q<shoulda>, [">= 0"])
    s.add_dependency(%q<nokogiri>, [">= 0"])
    s.add_dependency(%q<loofah>, [">= 0"])
    s.add_dependency(%q<httparty>, [">= 0"])
    s.add_dependency(%q<fast-stemmer>, [">= 0"])
    s.add_dependency(%q<chronic>, [">= 0"])
  end
end
|
92
|
+
|