pismo 0.6.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.markdown +3 -1
- data/VERSION +1 -1
- data/lib/pismo/internal_attributes.rb +9 -5
- data/pismo.gemspec +3 -3
- data/test/corpus/metadata_expected.yaml +3 -3
- metadata +32 -72
data/README.markdown
CHANGED
@@ -26,7 +26,9 @@ There's also a shorter "convenience" method which might be handy in IRB - it doe
|
|
26
26
|
|
27
27
|
Pismo['http://www.rubyflow.com/items/4082'].title # => "Install Ruby as a non-root User"
|
28
28
|
|
29
|
-
The current metadata methods are #title, #titles, #author, #authors, #lede, #keywords, #sentences(qty), #body, #feed, #feeds, #favicon, #description and #datetime. These are not fully documented here yet, you'll just need to try them out. The plural methods like #titles, #authors, and #feeds will return multiple matches in an array, if present. This is so you can use your own techniques to choose a "best" result in ambiguous cases.
|
29
|
+
The current metadata methods are #title, #titles, #author, #authors, #lede, #keywords, #sentences(qty), #body, #html_body, #feed, #feeds, #favicon, #description and #datetime. These are not fully documented here yet, you'll just need to try them out. The plural methods like #titles, #authors, and #feeds will return multiple matches in an array, if present. This is so you can use your own techniques to choose a "best" result in ambiguous cases.
|
30
|
+
|
31
|
+
The html_body and body methods will be of particular interest. They return the "body" of the page as determined by Pismo's "Reader" (like Arc90's Readability or Safari Reader) algorithm. #body returns it as plain-text, #html_body maintains some basic HTML styling.
|
30
32
|
|
31
33
|
## CAUTIONS / WARNINGS:
|
32
34
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.6.0
|
1
|
+
0.6.1
|
data/lib/pismo/internal_attributes.rb
CHANGED
@@ -189,12 +189,12 @@ module Pismo
|
|
189
189
|
'.post-text p',
|
190
190
|
'#blogpost p',
|
191
191
|
'.story-teaser',
|
192
|
-
'.subhead',
|
193
192
|
'//div[@class="entrytext"]//p[string-length()>10]', # Ruby Inside / Kubrick style
|
194
193
|
'section p',
|
195
194
|
'.entry .text p',
|
196
195
|
'.entry-content p',
|
197
196
|
'#wikicontent p', # Google Code style
|
197
|
+
'.wikistyle p', # GitHub style
|
198
198
|
'//td[@class="storybody"]/p[string-length()>10]', # BBC News style
|
199
199
|
'//div[@class="entry"]//p[string-length()>100]',
|
200
200
|
# The below is a horrible, horrible way to pluck out lead paras from crappy Blogspot blogs that
|
@@ -206,16 +206,15 @@ module Pismo
|
|
206
206
|
'#article p',
|
207
207
|
'.post-body',
|
208
208
|
'.entry-content',
|
209
|
-
'.body p',
|
210
209
|
'.document_description_short p', # Scribd
|
211
210
|
'.single-post p'
|
212
211
|
], all)
|
213
212
|
|
214
213
|
# TODO: Improve sentence extraction - this is dire even if it "works for now"
|
215
214
|
if lede && String === lede
|
216
|
-
return lede[/^(.*?[\.\!\?]\s){2}/m] || lede
|
215
|
+
return (lede[/^(.*?[\.\!\?]\s){2}/m] || lede).to_s.strip
|
217
216
|
elsif lede && Array === lede
|
218
|
-
return lede.map { |l| l.to_s[/^(.*?[\.\!\?]\s){2}/m] || l }.uniq
|
217
|
+
return lede.map { |l| l.to_s[/^(.*?[\.\!\?]\s){2}/m].strip || l }.uniq
|
219
218
|
else
|
220
219
|
return reader_doc && !reader_doc.sentences(2).empty? ? reader_doc.sentences(2).join(' ') : nil
|
221
220
|
end
|
@@ -268,7 +267,12 @@ module Pismo
|
|
268
267
|
|
269
268
|
# Returns body text as determined by Reader algorithm
|
270
269
|
def body
|
271
|
-
@body ||= reader_doc.content.strip
|
270
|
+
@body ||= reader_doc.content(true).strip
|
271
|
+
end
|
272
|
+
|
273
|
+
# Returns body text as determined by Reader algorithm WITH basic HTML formatting intact
|
274
|
+
def html_body
|
275
|
+
@html_body ||= reader_doc.content.strip
|
272
276
|
end
|
273
277
|
|
274
278
|
# Returns URL to the site's favicon
|
data/pismo.gemspec
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{pismo}
|
8
|
-
s.version = "0.6.0"
|
8
|
+
s.version = "0.6.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Peter Cooper"]
|
@@ -59,7 +59,7 @@ Gem::Specification.new do |s|
|
|
59
59
|
s.homepage = %q{http://github.com/peterc/pismo}
|
60
60
|
s.rdoc_options = ["--charset=UTF-8"]
|
61
61
|
s.require_paths = ["lib"]
|
62
|
-
s.rubygems_version = %q{1.3.
|
62
|
+
s.rubygems_version = %q{1.3.5}
|
63
63
|
s.summary = %q{Extracts or retrieves content-related metadata from HTML pages}
|
64
64
|
s.test_files = [
|
65
65
|
"test/helper.rb",
|
@@ -71,7 +71,7 @@ Gem::Specification.new do |s|
|
|
71
71
|
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
72
72
|
s.specification_version = 3
|
73
73
|
|
74
|
-
if Gem::Version.new(Gem::
|
74
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
75
75
|
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
76
76
|
s.add_development_dependency(%q<awesome_print>, [">= 0"])
|
77
77
|
s.add_runtime_dependency(%q<jeweler>, [">= 0"])
|
data/test/corpus/metadata_expected.yaml
CHANGED
@@ -31,9 +31,9 @@
|
|
31
31
|
:feed: http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/rss.xml
|
32
32
|
:factor:
|
33
33
|
:title: Factor's bootstrap process explained
|
34
|
-
:lede: "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap.
|
34
|
+
:lede: "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap."
|
35
35
|
:ledes:
|
36
|
-
- "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap.
|
36
|
+
- "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap."
|
37
37
|
:youtube:
|
38
38
|
:title: YMO - Rydeen (Official Video)
|
39
39
|
:author: ymo1965
|
@@ -68,6 +68,6 @@
|
|
68
68
|
:sentences: I am pleased to report that the GCC Steering Committee and the FSF have approved the use of C++ in GCC itself. Of course, there's no reason for us to use C++ features just because we can. The goal is a better compiler for users, not a C++ code base for its own sake.
|
69
69
|
:queness:
|
70
70
|
:title: 18 Incredible CSS3 Effects You Have Never Seen Before
|
71
|
-
:lede: "CSS3 is hot these days and will soon be available in most modern browser. Just recently, I started to become aware to the present of CSS3 around the web.
|
71
|
+
:lede: "CSS3 is hot these days and will soon be available in most modern browser. Just recently, I started to become aware to the present of CSS3 around the web."
|
72
72
|
:sentences: CSS3 is hot these days and will soon be available in most modern browser. Just recently, I started to become aware to the present of CSS3 around the web. I can see some of the websites such as twitter and designer portfolios websites are using it.
|
73
73
|
:datetime: 2010-06-02 12:00:00 +01:00
|
metadata
CHANGED
@@ -1,13 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pismo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
prerelease: false
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 6
|
9
|
-
- 0
|
10
|
-
version: 0.6.0
|
4
|
+
version: 0.6.1
|
11
5
|
platform: ruby
|
12
6
|
authors:
|
13
7
|
- Peter Cooper
|
@@ -20,102 +14,74 @@ default_executable: pismo
|
|
20
14
|
dependencies:
|
21
15
|
- !ruby/object:Gem::Dependency
|
22
16
|
name: shoulda
|
23
|
-
|
24
|
-
|
25
|
-
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
20
|
requirements:
|
27
21
|
- - ">="
|
28
22
|
- !ruby/object:Gem::Version
|
29
|
-
hash: 3
|
30
|
-
segments:
|
31
|
-
- 0
|
32
23
|
version: "0"
|
33
|
-
|
34
|
-
version_requirements: *id001
|
24
|
+
version:
|
35
25
|
- !ruby/object:Gem::Dependency
|
36
26
|
name: awesome_print
|
37
|
-
|
38
|
-
|
39
|
-
|
27
|
+
type: :development
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
30
|
requirements:
|
41
31
|
- - ">="
|
42
32
|
- !ruby/object:Gem::Version
|
43
|
-
hash: 3
|
44
|
-
segments:
|
45
|
-
- 0
|
46
33
|
version: "0"
|
47
|
-
|
48
|
-
version_requirements: *id002
|
34
|
+
version:
|
49
35
|
- !ruby/object:Gem::Dependency
|
50
36
|
name: jeweler
|
51
|
-
|
52
|
-
|
53
|
-
|
37
|
+
type: :runtime
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
54
40
|
requirements:
|
55
41
|
- - ">="
|
56
42
|
- !ruby/object:Gem::Version
|
57
|
-
hash: 3
|
58
|
-
segments:
|
59
|
-
- 0
|
60
43
|
version: "0"
|
61
|
-
|
62
|
-
version_requirements: *id003
|
44
|
+
version:
|
63
45
|
- !ruby/object:Gem::Dependency
|
64
46
|
name: nokogiri
|
65
|
-
|
66
|
-
|
67
|
-
|
47
|
+
type: :runtime
|
48
|
+
version_requirement:
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
68
50
|
requirements:
|
69
51
|
- - ">="
|
70
52
|
- !ruby/object:Gem::Version
|
71
|
-
hash: 3
|
72
|
-
segments:
|
73
|
-
- 0
|
74
53
|
version: "0"
|
75
|
-
|
76
|
-
version_requirements: *id004
|
54
|
+
version:
|
77
55
|
- !ruby/object:Gem::Dependency
|
78
56
|
name: sanitize
|
79
|
-
|
80
|
-
|
81
|
-
|
57
|
+
type: :runtime
|
58
|
+
version_requirement:
|
59
|
+
version_requirements: !ruby/object:Gem::Requirement
|
82
60
|
requirements:
|
83
61
|
- - ">="
|
84
62
|
- !ruby/object:Gem::Version
|
85
|
-
hash: 3
|
86
|
-
segments:
|
87
|
-
- 0
|
88
63
|
version: "0"
|
89
|
-
|
90
|
-
version_requirements: *id005
|
64
|
+
version:
|
91
65
|
- !ruby/object:Gem::Dependency
|
92
66
|
name: fast-stemmer
|
93
|
-
|
94
|
-
|
95
|
-
|
67
|
+
type: :runtime
|
68
|
+
version_requirement:
|
69
|
+
version_requirements: !ruby/object:Gem::Requirement
|
96
70
|
requirements:
|
97
71
|
- - ">="
|
98
72
|
- !ruby/object:Gem::Version
|
99
|
-
hash: 3
|
100
|
-
segments:
|
101
|
-
- 0
|
102
73
|
version: "0"
|
103
|
-
|
104
|
-
version_requirements: *id006
|
74
|
+
version:
|
105
75
|
- !ruby/object:Gem::Dependency
|
106
76
|
name: chronic
|
107
|
-
|
108
|
-
|
109
|
-
|
77
|
+
type: :runtime
|
78
|
+
version_requirement:
|
79
|
+
version_requirements: !ruby/object:Gem::Requirement
|
110
80
|
requirements:
|
111
81
|
- - ">="
|
112
82
|
- !ruby/object:Gem::Version
|
113
|
-
hash: 3
|
114
|
-
segments:
|
115
|
-
- 0
|
116
83
|
version: "0"
|
117
|
-
|
118
|
-
version_requirements: *id007
|
84
|
+
version:
|
119
85
|
description: Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, body text, keywords, RSS feed URL, favicon, etc.
|
120
86
|
email: git@peterc.org
|
121
87
|
executables:
|
@@ -172,27 +138,21 @@ rdoc_options:
|
|
172
138
|
require_paths:
|
173
139
|
- lib
|
174
140
|
required_ruby_version: !ruby/object:Gem::Requirement
|
175
|
-
none: false
|
176
141
|
requirements:
|
177
142
|
- - ">="
|
178
143
|
- !ruby/object:Gem::Version
|
179
|
-
hash: 3
|
180
|
-
segments:
|
181
|
-
- 0
|
182
144
|
version: "0"
|
145
|
+
version:
|
183
146
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
184
|
-
none: false
|
185
147
|
requirements:
|
186
148
|
- - ">="
|
187
149
|
- !ruby/object:Gem::Version
|
188
|
-
hash: 3
|
189
|
-
segments:
|
190
|
-
- 0
|
191
150
|
version: "0"
|
151
|
+
version:
|
192
152
|
requirements: []
|
193
153
|
|
194
154
|
rubyforge_project:
|
195
|
-
rubygems_version: 1.3.
|
155
|
+
rubygems_version: 1.3.5
|
196
156
|
signing_key:
|
197
157
|
specification_version: 3
|
198
158
|
summary: Extracts or retrieves content-related metadata from HTML pages
|