pismo 0.6.0 → 0.6.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.markdown +3 -1
- data/VERSION +1 -1
- data/lib/pismo/internal_attributes.rb +9 -5
- data/pismo.gemspec +3 -3
- data/test/corpus/metadata_expected.yaml +3 -3
- metadata +32 -72
data/README.markdown
CHANGED
@@ -26,7 +26,9 @@ There's also a shorter "convenience" method which might be handy in IRB - it doe
|
|
26
26
|
|
27
27
|
Pismo['http://www.rubyflow.com/items/4082'].title # => "Install Ruby as a non-root User"
|
28
28
|
|
29
|
-
The current metadata methods are #title, #titles, #author, #authors, #lede, #keywords, #sentences(qty), #body, #feed, #feeds, #favicon, #description and #datetime. These are not fully documented here yet, you'll just need to try them out. The plural methods like #titles, #authors, and #feeds will return multiple matches in an array, if present. This is so you can use your own techniques to choose a "best" result in ambiguous cases.
|
29
|
+
The current metadata methods are #title, #titles, #author, #authors, #lede, #keywords, #sentences(qty), #body, #html_body, #feed, #feeds, #favicon, #description and #datetime. These are not fully documented here yet, you'll just need to try them out. The plural methods like #titles, #authors, and #feeds will return multiple matches in an array, if present. This is so you can use your own techniques to choose a "best" result in ambiguous cases.
|
30
|
+
|
31
|
+
The html_body and body methods will be of particular interest. They return the "body" of the page as determined by Pismo's "Reader" (like Arc90's Readability or Safari Reader) algorithm. #body returns it as plain-text, #html_body maintains some basic HTML styling.
|
30
32
|
|
31
33
|
## CAUTIONS / WARNINGS:
|
32
34
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.6.0
|
1
|
+
0.6.1
|
@@ -189,12 +189,12 @@ module Pismo
|
|
189
189
|
'.post-text p',
|
190
190
|
'#blogpost p',
|
191
191
|
'.story-teaser',
|
192
|
-
'.subhead',
|
193
192
|
'//div[@class="entrytext"]//p[string-length()>10]', # Ruby Inside / Kubrick style
|
194
193
|
'section p',
|
195
194
|
'.entry .text p',
|
196
195
|
'.entry-content p',
|
197
196
|
'#wikicontent p', # Google Code style
|
197
|
+
'.wikistyle p', # GitHub style
|
198
198
|
'//td[@class="storybody"]/p[string-length()>10]', # BBC News style
|
199
199
|
'//div[@class="entry"]//p[string-length()>100]',
|
200
200
|
# The below is a horrible, horrible way to pluck out lead paras from crappy Blogspot blogs that
|
@@ -206,16 +206,15 @@ module Pismo
|
|
206
206
|
'#article p',
|
207
207
|
'.post-body',
|
208
208
|
'.entry-content',
|
209
|
-
'.body p',
|
210
209
|
'.document_description_short p', # Scribd
|
211
210
|
'.single-post p'
|
212
211
|
], all)
|
213
212
|
|
214
213
|
# TODO: Improve sentence extraction - this is dire even if it "works for now"
|
215
214
|
if lede && String === lede
|
216
|
-
return lede[/^(.*?[\.\!\?]\s){2}/m] || lede
|
215
|
+
return (lede[/^(.*?[\.\!\?]\s){2}/m] || lede).to_s.strip
|
217
216
|
elsif lede && Array === lede
|
218
|
-
return lede.map { |l| l.to_s[/^(.*?[\.\!\?]\s){2}/m] || l }.uniq
|
217
|
+
return lede.map { |l| l.to_s[/^(.*?[\.\!\?]\s){2}/m].strip || l }.uniq
|
219
218
|
else
|
220
219
|
return reader_doc && !reader_doc.sentences(2).empty? ? reader_doc.sentences(2).join(' ') : nil
|
221
220
|
end
|
@@ -268,7 +267,12 @@ module Pismo
|
|
268
267
|
|
269
268
|
# Returns body text as determined by Reader algorithm
|
270
269
|
def body
|
271
|
-
@body ||= reader_doc.content.strip
|
270
|
+
@body ||= reader_doc.content(true).strip
|
271
|
+
end
|
272
|
+
|
273
|
+
# Returns body text as determined by Reader algorithm WITH basic HTML formatting intact
|
274
|
+
def html_body
|
275
|
+
@html_body ||= reader_doc.content.strip
|
272
276
|
end
|
273
277
|
|
274
278
|
# Returns URL to the site's favicon
|
data/pismo.gemspec
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{pismo}
|
8
|
-
s.version = "0.6.0"
|
8
|
+
s.version = "0.6.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Peter Cooper"]
|
@@ -59,7 +59,7 @@ Gem::Specification.new do |s|
|
|
59
59
|
s.homepage = %q{http://github.com/peterc/pismo}
|
60
60
|
s.rdoc_options = ["--charset=UTF-8"]
|
61
61
|
s.require_paths = ["lib"]
|
62
|
-
s.rubygems_version = %q{1.3.
|
62
|
+
s.rubygems_version = %q{1.3.5}
|
63
63
|
s.summary = %q{Extracts or retrieves content-related metadata from HTML pages}
|
64
64
|
s.test_files = [
|
65
65
|
"test/helper.rb",
|
@@ -71,7 +71,7 @@ Gem::Specification.new do |s|
|
|
71
71
|
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
72
72
|
s.specification_version = 3
|
73
73
|
|
74
|
-
if Gem::Version.new(Gem::
|
74
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
75
75
|
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
76
76
|
s.add_development_dependency(%q<awesome_print>, [">= 0"])
|
77
77
|
s.add_runtime_dependency(%q<jeweler>, [">= 0"])
|
@@ -31,9 +31,9 @@
|
|
31
31
|
:feed: http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/rss.xml
|
32
32
|
:factor:
|
33
33
|
:title: Factor's bootstrap process explained
|
34
|
-
:lede: "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap.
|
34
|
+
:lede: "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap."
|
35
35
|
:ledes:
|
36
|
-
- "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap.
|
36
|
+
- "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap."
|
37
37
|
:youtube:
|
38
38
|
:title: YMO - Rydeen (Official Video)
|
39
39
|
:author: ymo1965
|
@@ -68,6 +68,6 @@
|
|
68
68
|
:sentences: I am pleased to report that the GCC Steering Committee and the FSF have approved the use of C++ in GCC itself. Of course, there's no reason for us to use C++ features just because we can. The goal is a better compiler for users, not a C++ code base for its own sake.
|
69
69
|
:queness:
|
70
70
|
:title: 18 Incredible CSS3 Effects You Have Never Seen Before
|
71
|
-
:lede: "CSS3 is hot these days and will soon be available in most modern browser. Just recently, I started to become aware to the present of CSS3 around the web.
|
71
|
+
:lede: "CSS3 is hot these days and will soon be available in most modern browser. Just recently, I started to become aware to the present of CSS3 around the web."
|
72
72
|
:sentences: CSS3 is hot these days and will soon be available in most modern browser. Just recently, I started to become aware to the present of CSS3 around the web. I can see some of the websites such as twitter and designer portfolios websites are using it.
|
73
73
|
:datetime: 2010-06-02 12:00:00 +01:00
|
metadata
CHANGED
@@ -1,13 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pismo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
prerelease: false
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 6
|
9
|
-
- 0
|
10
|
-
version: 0.6.0
|
4
|
+
version: 0.6.1
|
11
5
|
platform: ruby
|
12
6
|
authors:
|
13
7
|
- Peter Cooper
|
@@ -20,102 +14,74 @@ default_executable: pismo
|
|
20
14
|
dependencies:
|
21
15
|
- !ruby/object:Gem::Dependency
|
22
16
|
name: shoulda
|
23
|
-
|
24
|
-
|
25
|
-
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
20
|
requirements:
|
27
21
|
- - ">="
|
28
22
|
- !ruby/object:Gem::Version
|
29
|
-
hash: 3
|
30
|
-
segments:
|
31
|
-
- 0
|
32
23
|
version: "0"
|
33
|
-
|
34
|
-
version_requirements: *id001
|
24
|
+
version:
|
35
25
|
- !ruby/object:Gem::Dependency
|
36
26
|
name: awesome_print
|
37
|
-
|
38
|
-
|
39
|
-
|
27
|
+
type: :development
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
30
|
requirements:
|
41
31
|
- - ">="
|
42
32
|
- !ruby/object:Gem::Version
|
43
|
-
hash: 3
|
44
|
-
segments:
|
45
|
-
- 0
|
46
33
|
version: "0"
|
47
|
-
|
48
|
-
version_requirements: *id002
|
34
|
+
version:
|
49
35
|
- !ruby/object:Gem::Dependency
|
50
36
|
name: jeweler
|
51
|
-
|
52
|
-
|
53
|
-
|
37
|
+
type: :runtime
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
54
40
|
requirements:
|
55
41
|
- - ">="
|
56
42
|
- !ruby/object:Gem::Version
|
57
|
-
hash: 3
|
58
|
-
segments:
|
59
|
-
- 0
|
60
43
|
version: "0"
|
61
|
-
|
62
|
-
version_requirements: *id003
|
44
|
+
version:
|
63
45
|
- !ruby/object:Gem::Dependency
|
64
46
|
name: nokogiri
|
65
|
-
|
66
|
-
|
67
|
-
|
47
|
+
type: :runtime
|
48
|
+
version_requirement:
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
68
50
|
requirements:
|
69
51
|
- - ">="
|
70
52
|
- !ruby/object:Gem::Version
|
71
|
-
hash: 3
|
72
|
-
segments:
|
73
|
-
- 0
|
74
53
|
version: "0"
|
75
|
-
|
76
|
-
version_requirements: *id004
|
54
|
+
version:
|
77
55
|
- !ruby/object:Gem::Dependency
|
78
56
|
name: sanitize
|
79
|
-
|
80
|
-
|
81
|
-
|
57
|
+
type: :runtime
|
58
|
+
version_requirement:
|
59
|
+
version_requirements: !ruby/object:Gem::Requirement
|
82
60
|
requirements:
|
83
61
|
- - ">="
|
84
62
|
- !ruby/object:Gem::Version
|
85
|
-
hash: 3
|
86
|
-
segments:
|
87
|
-
- 0
|
88
63
|
version: "0"
|
89
|
-
|
90
|
-
version_requirements: *id005
|
64
|
+
version:
|
91
65
|
- !ruby/object:Gem::Dependency
|
92
66
|
name: fast-stemmer
|
93
|
-
|
94
|
-
|
95
|
-
|
67
|
+
type: :runtime
|
68
|
+
version_requirement:
|
69
|
+
version_requirements: !ruby/object:Gem::Requirement
|
96
70
|
requirements:
|
97
71
|
- - ">="
|
98
72
|
- !ruby/object:Gem::Version
|
99
|
-
hash: 3
|
100
|
-
segments:
|
101
|
-
- 0
|
102
73
|
version: "0"
|
103
|
-
|
104
|
-
version_requirements: *id006
|
74
|
+
version:
|
105
75
|
- !ruby/object:Gem::Dependency
|
106
76
|
name: chronic
|
107
|
-
|
108
|
-
|
109
|
-
|
77
|
+
type: :runtime
|
78
|
+
version_requirement:
|
79
|
+
version_requirements: !ruby/object:Gem::Requirement
|
110
80
|
requirements:
|
111
81
|
- - ">="
|
112
82
|
- !ruby/object:Gem::Version
|
113
|
-
hash: 3
|
114
|
-
segments:
|
115
|
-
- 0
|
116
83
|
version: "0"
|
117
|
-
|
118
|
-
version_requirements: *id007
|
84
|
+
version:
|
119
85
|
description: Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, body text, keywords, RSS feed URL, favicon, etc.
|
120
86
|
email: git@peterc.org
|
121
87
|
executables:
|
@@ -172,27 +138,21 @@ rdoc_options:
|
|
172
138
|
require_paths:
|
173
139
|
- lib
|
174
140
|
required_ruby_version: !ruby/object:Gem::Requirement
|
175
|
-
none: false
|
176
141
|
requirements:
|
177
142
|
- - ">="
|
178
143
|
- !ruby/object:Gem::Version
|
179
|
-
hash: 3
|
180
|
-
segments:
|
181
|
-
- 0
|
182
144
|
version: "0"
|
145
|
+
version:
|
183
146
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
184
|
-
none: false
|
185
147
|
requirements:
|
186
148
|
- - ">="
|
187
149
|
- !ruby/object:Gem::Version
|
188
|
-
hash: 3
|
189
|
-
segments:
|
190
|
-
- 0
|
191
150
|
version: "0"
|
151
|
+
version:
|
192
152
|
requirements: []
|
193
153
|
|
194
154
|
rubyforge_project:
|
195
|
-
rubygems_version: 1.3.
|
155
|
+
rubygems_version: 1.3.5
|
196
156
|
signing_key:
|
197
157
|
specification_version: 3
|
198
158
|
summary: Extracts or retrieves content-related metadata from HTML pages
|