pismo 0.6.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.markdown CHANGED
@@ -26,7 +26,9 @@ There's also a shorter "convenience" method which might be handy in IRB - it doe
26
26
 
27
27
  Pismo['http://www.rubyflow.com/items/4082'].title # => "Install Ruby as a non-root User"
28
28
 
29
- The current metadata methods are #title, #titles, #author, #authors, #lede, #keywords, #sentences(qty), #body, #feed, #feeds, #favicon, #description and #datetime. These are not fully documented here yet, you'll just need to try them out. The plural methods like #titles, #authors, and #feeds will return multiple matches in an array, if present. This is so you can use your own techniques to choose a "best" result in ambiguous cases.
29
+ The current metadata methods are #title, #titles, #author, #authors, #lede, #keywords, #sentences(qty), #body, #html_body, #feed, #feeds, #favicon, #description and #datetime. These are not fully documented here yet, you'll just need to try them out. The plural methods like #titles, #authors, and #feeds will return multiple matches in an array, if present. This is so you can use your own techniques to choose a "best" result in ambiguous cases.
30
+
31
+ The html_body and body methods will be of particular interest. They return the "body" of the page as determined by Pismo's "Reader" (like Arc90's Readability or Safari Reader) algorithm. #body returns it as plain-text, #html_body maintains some basic HTML styling.
30
32
 
31
33
  ## CAUTIONS / WARNINGS:
32
34
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.6.0
1
+ 0.6.1
@@ -189,12 +189,12 @@ module Pismo
189
189
  '.post-text p',
190
190
  '#blogpost p',
191
191
  '.story-teaser',
192
- '.subhead',
193
192
  '//div[@class="entrytext"]//p[string-length()>10]', # Ruby Inside / Kubrick style
194
193
  'section p',
195
194
  '.entry .text p',
196
195
  '.entry-content p',
197
196
  '#wikicontent p', # Google Code style
197
+ '.wikistyle p', # GitHub style
198
198
  '//td[@class="storybody"]/p[string-length()>10]', # BBC News style
199
199
  '//div[@class="entry"]//p[string-length()>100]',
200
200
  # The below is a horrible, horrible way to pluck out lead paras from crappy Blogspot blogs that
@@ -206,16 +206,15 @@ module Pismo
206
206
  '#article p',
207
207
  '.post-body',
208
208
  '.entry-content',
209
- '.body p',
210
209
  '.document_description_short p', # Scribd
211
210
  '.single-post p'
212
211
  ], all)
213
212
 
214
213
  # TODO: Improve sentence extraction - this is dire even if it "works for now"
215
214
  if lede && String === lede
216
- return lede[/^(.*?[\.\!\?]\s){2}/m] || lede
215
+ return (lede[/^(.*?[\.\!\?]\s){2}/m] || lede).to_s.strip
217
216
  elsif lede && Array === lede
218
- return lede.map { |l| l.to_s[/^(.*?[\.\!\?]\s){2}/m] || l }.uniq
217
+ return lede.map { |l| l.to_s[/^(.*?[\.\!\?]\s){2}/m].strip || l }.uniq
219
218
  else
220
219
  return reader_doc && !reader_doc.sentences(2).empty? ? reader_doc.sentences(2).join(' ') : nil
221
220
  end
@@ -268,7 +267,12 @@ module Pismo
268
267
 
269
268
  # Returns body text as determined by Reader algorithm
270
269
  def body
271
- @body ||= reader_doc.content.strip
270
+ @body ||= reader_doc.content(true).strip
271
+ end
272
+
273
+ # Returns body text as determined by Reader algorithm WITH basic HTML formatting intact
274
+ def html_body
275
+ @html_body ||= reader_doc.content.strip
272
276
  end
273
277
 
274
278
  # Returns URL to the site's favicon
data/pismo.gemspec CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{pismo}
8
- s.version = "0.6.0"
8
+ s.version = "0.6.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Peter Cooper"]
@@ -59,7 +59,7 @@ Gem::Specification.new do |s|
59
59
  s.homepage = %q{http://github.com/peterc/pismo}
60
60
  s.rdoc_options = ["--charset=UTF-8"]
61
61
  s.require_paths = ["lib"]
62
- s.rubygems_version = %q{1.3.7}
62
+ s.rubygems_version = %q{1.3.5}
63
63
  s.summary = %q{Extracts or retrieves content-related metadata from HTML pages}
64
64
  s.test_files = [
65
65
  "test/helper.rb",
@@ -71,7 +71,7 @@ Gem::Specification.new do |s|
71
71
  current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
72
72
  s.specification_version = 3
73
73
 
74
- if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
74
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
75
75
  s.add_development_dependency(%q<shoulda>, [">= 0"])
76
76
  s.add_development_dependency(%q<awesome_print>, [">= 0"])
77
77
  s.add_runtime_dependency(%q<jeweler>, [">= 0"])
@@ -31,9 +31,9 @@
31
31
  :feed: http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/rss.xml
32
32
  :factor:
33
33
  :title: Factor's bootstrap process explained
34
- :lede: "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap. "
34
+ :lede: "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap."
35
35
  :ledes:
36
- - "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap. "
36
+ - "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap."
37
37
  :youtube:
38
38
  :title: YMO - Rydeen (Official Video)
39
39
  :author: ymo1965
@@ -68,6 +68,6 @@
68
68
  :sentences: I am pleased to report that the GCC Steering Committee and the FSF have approved the use of C++ in GCC itself. Of course, there's no reason for us to use C++ features just because we can. The goal is a better compiler for users, not a C++ code base for its own sake.
69
69
  :queness:
70
70
  :title: 18 Incredible CSS3 Effects You Have Never Seen Before
71
- :lede: "CSS3 is hot these days and will soon be available in most modern browser. Just recently, I started to become aware to the present of CSS3 around the web. "
71
+ :lede: "CSS3 is hot these days and will soon be available in most modern browser. Just recently, I started to become aware to the present of CSS3 around the web."
72
72
  :sentences: CSS3 is hot these days and will soon be available in most modern browser. Just recently, I started to become aware to the present of CSS3 around the web. I can see some of the websites such as twitter and designer portfolios websites are using it.
73
73
  :datetime: 2010-06-02 12:00:00 +01:00
metadata CHANGED
@@ -1,13 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pismo
3
3
  version: !ruby/object:Gem::Version
4
- hash: 7
5
- prerelease: false
6
- segments:
7
- - 0
8
- - 6
9
- - 0
10
- version: 0.6.0
4
+ version: 0.6.1
11
5
  platform: ruby
12
6
  authors:
13
7
  - Peter Cooper
@@ -20,102 +14,74 @@ default_executable: pismo
20
14
  dependencies:
21
15
  - !ruby/object:Gem::Dependency
22
16
  name: shoulda
23
- prerelease: false
24
- requirement: &id001 !ruby/object:Gem::Requirement
25
- none: false
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
26
20
  requirements:
27
21
  - - ">="
28
22
  - !ruby/object:Gem::Version
29
- hash: 3
30
- segments:
31
- - 0
32
23
  version: "0"
33
- type: :development
34
- version_requirements: *id001
24
+ version:
35
25
  - !ruby/object:Gem::Dependency
36
26
  name: awesome_print
37
- prerelease: false
38
- requirement: &id002 !ruby/object:Gem::Requirement
39
- none: false
27
+ type: :development
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
40
30
  requirements:
41
31
  - - ">="
42
32
  - !ruby/object:Gem::Version
43
- hash: 3
44
- segments:
45
- - 0
46
33
  version: "0"
47
- type: :development
48
- version_requirements: *id002
34
+ version:
49
35
  - !ruby/object:Gem::Dependency
50
36
  name: jeweler
51
- prerelease: false
52
- requirement: &id003 !ruby/object:Gem::Requirement
53
- none: false
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
54
40
  requirements:
55
41
  - - ">="
56
42
  - !ruby/object:Gem::Version
57
- hash: 3
58
- segments:
59
- - 0
60
43
  version: "0"
61
- type: :runtime
62
- version_requirements: *id003
44
+ version:
63
45
  - !ruby/object:Gem::Dependency
64
46
  name: nokogiri
65
- prerelease: false
66
- requirement: &id004 !ruby/object:Gem::Requirement
67
- none: false
47
+ type: :runtime
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
68
50
  requirements:
69
51
  - - ">="
70
52
  - !ruby/object:Gem::Version
71
- hash: 3
72
- segments:
73
- - 0
74
53
  version: "0"
75
- type: :runtime
76
- version_requirements: *id004
54
+ version:
77
55
  - !ruby/object:Gem::Dependency
78
56
  name: sanitize
79
- prerelease: false
80
- requirement: &id005 !ruby/object:Gem::Requirement
81
- none: false
57
+ type: :runtime
58
+ version_requirement:
59
+ version_requirements: !ruby/object:Gem::Requirement
82
60
  requirements:
83
61
  - - ">="
84
62
  - !ruby/object:Gem::Version
85
- hash: 3
86
- segments:
87
- - 0
88
63
  version: "0"
89
- type: :runtime
90
- version_requirements: *id005
64
+ version:
91
65
  - !ruby/object:Gem::Dependency
92
66
  name: fast-stemmer
93
- prerelease: false
94
- requirement: &id006 !ruby/object:Gem::Requirement
95
- none: false
67
+ type: :runtime
68
+ version_requirement:
69
+ version_requirements: !ruby/object:Gem::Requirement
96
70
  requirements:
97
71
  - - ">="
98
72
  - !ruby/object:Gem::Version
99
- hash: 3
100
- segments:
101
- - 0
102
73
  version: "0"
103
- type: :runtime
104
- version_requirements: *id006
74
+ version:
105
75
  - !ruby/object:Gem::Dependency
106
76
  name: chronic
107
- prerelease: false
108
- requirement: &id007 !ruby/object:Gem::Requirement
109
- none: false
77
+ type: :runtime
78
+ version_requirement:
79
+ version_requirements: !ruby/object:Gem::Requirement
110
80
  requirements:
111
81
  - - ">="
112
82
  - !ruby/object:Gem::Version
113
- hash: 3
114
- segments:
115
- - 0
116
83
  version: "0"
117
- type: :runtime
118
- version_requirements: *id007
84
+ version:
119
85
  description: Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, body text, keywords, RSS feed URL, favicon, etc.
120
86
  email: git@peterc.org
121
87
  executables:
@@ -172,27 +138,21 @@ rdoc_options:
172
138
  require_paths:
173
139
  - lib
174
140
  required_ruby_version: !ruby/object:Gem::Requirement
175
- none: false
176
141
  requirements:
177
142
  - - ">="
178
143
  - !ruby/object:Gem::Version
179
- hash: 3
180
- segments:
181
- - 0
182
144
  version: "0"
145
+ version:
183
146
  required_rubygems_version: !ruby/object:Gem::Requirement
184
- none: false
185
147
  requirements:
186
148
  - - ">="
187
149
  - !ruby/object:Gem::Version
188
- hash: 3
189
- segments:
190
- - 0
191
150
  version: "0"
151
+ version:
192
152
  requirements: []
193
153
 
194
154
  rubyforge_project:
195
- rubygems_version: 1.3.7
155
+ rubygems_version: 1.3.5
196
156
  signing_key:
197
157
  specification_version: 3
198
158
  summary: Extracts or retrieves content-related metadata from HTML pages