pismo 0.6.0 → 0.6.1

Sign up to get free protection for your applications and to get access to all the features.
data/README.markdown CHANGED
@@ -26,7 +26,9 @@ There's also a shorter "convenience" method which might be handy in IRB - it doe
26
26
 
27
27
  Pismo['http://www.rubyflow.com/items/4082'].title # => "Install Ruby as a non-root User"
28
28
 
29
- The current metadata methods are #title, #titles, #author, #authors, #lede, #keywords, #sentences(qty), #body, #feed, #feeds, #favicon, #description and #datetime. These are not fully documented here yet, you'll just need to try them out. The plural methods like #titles, #authors, and #feeds will return multiple matches in an array, if present. This is so you can use your own techniques to choose a "best" result in ambiguous cases.
29
+ The current metadata methods are #title, #titles, #author, #authors, #lede, #keywords, #sentences(qty), #body, #html_body, #feed, #feeds, #favicon, #description and #datetime. These are not fully documented here yet; you'll just need to try them out. The plural methods like #titles, #authors, and #feeds will return multiple matches in an array, if present. This is so you can use your own techniques to choose a "best" result in ambiguous cases.
30
+
31
+ The #html_body and #body methods will be of particular interest. They return the "body" of the page as determined by Pismo's "Reader" algorithm (similar to Arc90's Readability or Safari Reader). #body returns it as plain text, while #html_body maintains some basic HTML styling.
30
32
 
31
33
  ## CAUTIONS / WARNINGS:
32
34
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.6.0
1
+ 0.6.1
@@ -189,12 +189,12 @@ module Pismo
189
189
  '.post-text p',
190
190
  '#blogpost p',
191
191
  '.story-teaser',
192
- '.subhead',
193
192
  '//div[@class="entrytext"]//p[string-length()>10]', # Ruby Inside / Kubrick style
194
193
  'section p',
195
194
  '.entry .text p',
196
195
  '.entry-content p',
197
196
  '#wikicontent p', # Google Code style
197
+ '.wikistyle p', # GitHub style
198
198
  '//td[@class="storybody"]/p[string-length()>10]', # BBC News style
199
199
  '//div[@class="entry"]//p[string-length()>100]',
200
200
  # The below is a horrible, horrible way to pluck out lead paras from crappy Blogspot blogs that
@@ -206,16 +206,15 @@ module Pismo
206
206
  '#article p',
207
207
  '.post-body',
208
208
  '.entry-content',
209
- '.body p',
210
209
  '.document_description_short p', # Scribd
211
210
  '.single-post p'
212
211
  ], all)
213
212
 
214
213
  # TODO: Improve sentence extraction - this is dire even if it "works for now"
215
214
  if lede && String === lede
216
- return lede[/^(.*?[\.\!\?]\s){2}/m] || lede
215
+ return (lede[/^(.*?[\.\!\?]\s){2}/m] || lede).to_s.strip
217
216
  elsif lede && Array === lede
218
- return lede.map { |l| l.to_s[/^(.*?[\.\!\?]\s){2}/m] || l }.uniq
217
+ return lede.map { |l| l.to_s[/^(.*?[\.\!\?]\s){2}/m].strip || l }.uniq
219
218
  else
220
219
  return reader_doc && !reader_doc.sentences(2).empty? ? reader_doc.sentences(2).join(' ') : nil
221
220
  end
@@ -268,7 +267,12 @@ module Pismo
268
267
 
269
268
  # Returns body text as determined by Reader algorithm
270
269
  def body
271
- @body ||= reader_doc.content.strip
270
+ @body ||= reader_doc.content(true).strip
271
+ end
272
+
273
+ # Returns body text as determined by Reader algorithm WITH basic HTML formatting intact
274
+ def html_body
275
+ @html_body ||= reader_doc.content.strip
272
276
  end
273
277
 
274
278
  # Returns URL to the site's favicon
data/pismo.gemspec CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{pismo}
8
- s.version = "0.6.0"
8
+ s.version = "0.6.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Peter Cooper"]
@@ -59,7 +59,7 @@ Gem::Specification.new do |s|
59
59
  s.homepage = %q{http://github.com/peterc/pismo}
60
60
  s.rdoc_options = ["--charset=UTF-8"]
61
61
  s.require_paths = ["lib"]
62
- s.rubygems_version = %q{1.3.7}
62
+ s.rubygems_version = %q{1.3.5}
63
63
  s.summary = %q{Extracts or retrieves content-related metadata from HTML pages}
64
64
  s.test_files = [
65
65
  "test/helper.rb",
@@ -71,7 +71,7 @@ Gem::Specification.new do |s|
71
71
  current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
72
72
  s.specification_version = 3
73
73
 
74
- if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
74
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
75
75
  s.add_development_dependency(%q<shoulda>, [">= 0"])
76
76
  s.add_development_dependency(%q<awesome_print>, [">= 0"])
77
77
  s.add_runtime_dependency(%q<jeweler>, [">= 0"])
@@ -31,9 +31,9 @@
31
31
  :feed: http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/rss.xml
32
32
  :factor:
33
33
  :title: Factor's bootstrap process explained
34
- :lede: "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap. "
34
+ :lede: "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap."
35
35
  :ledes:
36
- - "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap. "
36
+ - "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap."
37
37
  :youtube:
38
38
  :title: YMO - Rydeen (Official Video)
39
39
  :author: ymo1965
@@ -68,6 +68,6 @@
68
68
  :sentences: I am pleased to report that the GCC Steering Committee and the FSF have approved the use of C++ in GCC itself. Of course, there's no reason for us to use C++ features just because we can. The goal is a better compiler for users, not a C++ code base for its own sake.
69
69
  :queness:
70
70
  :title: 18 Incredible CSS3 Effects You Have Never Seen Before
71
- :lede: "CSS3 is hot these days and will soon be available in most modern browser. Just recently, I started to become aware to the present of CSS3 around the web. "
71
+ :lede: "CSS3 is hot these days and will soon be available in most modern browser. Just recently, I started to become aware to the present of CSS3 around the web."
72
72
  :sentences: CSS3 is hot these days and will soon be available in most modern browser. Just recently, I started to become aware to the present of CSS3 around the web. I can see some of the websites such as twitter and designer portfolios websites are using it.
73
73
  :datetime: 2010-06-02 12:00:00 +01:00
metadata CHANGED
@@ -1,13 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pismo
3
3
  version: !ruby/object:Gem::Version
4
- hash: 7
5
- prerelease: false
6
- segments:
7
- - 0
8
- - 6
9
- - 0
10
- version: 0.6.0
4
+ version: 0.6.1
11
5
  platform: ruby
12
6
  authors:
13
7
  - Peter Cooper
@@ -20,102 +14,74 @@ default_executable: pismo
20
14
  dependencies:
21
15
  - !ruby/object:Gem::Dependency
22
16
  name: shoulda
23
- prerelease: false
24
- requirement: &id001 !ruby/object:Gem::Requirement
25
- none: false
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
26
20
  requirements:
27
21
  - - ">="
28
22
  - !ruby/object:Gem::Version
29
- hash: 3
30
- segments:
31
- - 0
32
23
  version: "0"
33
- type: :development
34
- version_requirements: *id001
24
+ version:
35
25
  - !ruby/object:Gem::Dependency
36
26
  name: awesome_print
37
- prerelease: false
38
- requirement: &id002 !ruby/object:Gem::Requirement
39
- none: false
27
+ type: :development
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
40
30
  requirements:
41
31
  - - ">="
42
32
  - !ruby/object:Gem::Version
43
- hash: 3
44
- segments:
45
- - 0
46
33
  version: "0"
47
- type: :development
48
- version_requirements: *id002
34
+ version:
49
35
  - !ruby/object:Gem::Dependency
50
36
  name: jeweler
51
- prerelease: false
52
- requirement: &id003 !ruby/object:Gem::Requirement
53
- none: false
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
54
40
  requirements:
55
41
  - - ">="
56
42
  - !ruby/object:Gem::Version
57
- hash: 3
58
- segments:
59
- - 0
60
43
  version: "0"
61
- type: :runtime
62
- version_requirements: *id003
44
+ version:
63
45
  - !ruby/object:Gem::Dependency
64
46
  name: nokogiri
65
- prerelease: false
66
- requirement: &id004 !ruby/object:Gem::Requirement
67
- none: false
47
+ type: :runtime
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
68
50
  requirements:
69
51
  - - ">="
70
52
  - !ruby/object:Gem::Version
71
- hash: 3
72
- segments:
73
- - 0
74
53
  version: "0"
75
- type: :runtime
76
- version_requirements: *id004
54
+ version:
77
55
  - !ruby/object:Gem::Dependency
78
56
  name: sanitize
79
- prerelease: false
80
- requirement: &id005 !ruby/object:Gem::Requirement
81
- none: false
57
+ type: :runtime
58
+ version_requirement:
59
+ version_requirements: !ruby/object:Gem::Requirement
82
60
  requirements:
83
61
  - - ">="
84
62
  - !ruby/object:Gem::Version
85
- hash: 3
86
- segments:
87
- - 0
88
63
  version: "0"
89
- type: :runtime
90
- version_requirements: *id005
64
+ version:
91
65
  - !ruby/object:Gem::Dependency
92
66
  name: fast-stemmer
93
- prerelease: false
94
- requirement: &id006 !ruby/object:Gem::Requirement
95
- none: false
67
+ type: :runtime
68
+ version_requirement:
69
+ version_requirements: !ruby/object:Gem::Requirement
96
70
  requirements:
97
71
  - - ">="
98
72
  - !ruby/object:Gem::Version
99
- hash: 3
100
- segments:
101
- - 0
102
73
  version: "0"
103
- type: :runtime
104
- version_requirements: *id006
74
+ version:
105
75
  - !ruby/object:Gem::Dependency
106
76
  name: chronic
107
- prerelease: false
108
- requirement: &id007 !ruby/object:Gem::Requirement
109
- none: false
77
+ type: :runtime
78
+ version_requirement:
79
+ version_requirements: !ruby/object:Gem::Requirement
110
80
  requirements:
111
81
  - - ">="
112
82
  - !ruby/object:Gem::Version
113
- hash: 3
114
- segments:
115
- - 0
116
83
  version: "0"
117
- type: :runtime
118
- version_requirements: *id007
84
+ version:
119
85
  description: Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, body text, keywords, RSS feed URL, favicon, etc.
120
86
  email: git@peterc.org
121
87
  executables:
@@ -172,27 +138,21 @@ rdoc_options:
172
138
  require_paths:
173
139
  - lib
174
140
  required_ruby_version: !ruby/object:Gem::Requirement
175
- none: false
176
141
  requirements:
177
142
  - - ">="
178
143
  - !ruby/object:Gem::Version
179
- hash: 3
180
- segments:
181
- - 0
182
144
  version: "0"
145
+ version:
183
146
  required_rubygems_version: !ruby/object:Gem::Requirement
184
- none: false
185
147
  requirements:
186
148
  - - ">="
187
149
  - !ruby/object:Gem::Version
188
- hash: 3
189
- segments:
190
- - 0
191
150
  version: "0"
151
+ version:
192
152
  requirements: []
193
153
 
194
154
  rubyforge_project:
195
- rubygems_version: 1.3.7
155
+ rubygems_version: 1.3.5
196
156
  signing_key:
197
157
  specification_version: 3
198
158
  summary: Extracts or retrieves content-related metadata from HTML pages