pismo 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/pismo.rb +7 -2
- data/lib/pismo/internal_attributes.rb +7 -5
- data/lib/pismo/readability.rb +4 -4
- data/lib/pismo/stopwords.txt +4 -1
- data/pismo.gemspec +3 -2
- data/test/corpus/metadata_expected.yaml +6 -47
- data/test/corpus/metadata_expected.yaml.old +122 -0
- metadata +3 -2
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.2
|
1
|
+
0.2.3
|
data/lib/pismo.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
1
3
|
require 'open-uri'
|
2
4
|
require 'nokogiri'
|
3
5
|
require 'fast_stemmer'
|
@@ -33,9 +35,12 @@ class Nokogiri::HTML::Document
|
|
33
35
|
elsif query.is_a?(Array)
|
34
36
|
result = query[1].call(self.search(query.first).first).strip rescue nil
|
35
37
|
end
|
38
|
+
|
36
39
|
if result
|
37
|
-
|
38
|
-
|
40
|
+
# result.gsub!(/\342\200\231/, '\'')
|
41
|
+
# result.gsub!(/\342\200\224/, '-')
|
42
|
+
result.gsub!('’', '\'')
|
43
|
+
result.gsub!('—', '-')
|
39
44
|
return result
|
40
45
|
end
|
41
46
|
end
|
@@ -3,6 +3,7 @@ module Pismo
|
|
3
3
|
module InternalAttributes
|
4
4
|
# Returns the title of the page/content - attempts to strip site name, etc, if possible
|
5
5
|
def title
|
6
|
+
# TODO: Memoizations
|
6
7
|
title = @doc.match( 'h2.title',
|
7
8
|
'.entry h2', # Common style
|
8
9
|
'.entryheader h1', # Ruby Inside/Kubrick
|
@@ -147,12 +148,13 @@ module Pismo
|
|
147
148
|
|
148
149
|
words = {}
|
149
150
|
|
150
|
-
# Convert doc to lowercase, scrub out most HTML tags
|
151
|
-
|
151
|
+
# Convert doc to lowercase, scrub out most HTML tags, then keep track of words
|
152
|
+
cached_title = title
|
153
|
+
body.downcase.gsub(/\<[^\>]{1,100}\>/, '').gsub(/\&\w+\;/, '').scan(/\b[a-z][a-z\'\+\#\.]*\b/).each do |word|
|
152
154
|
next if word.length > options[:word_length_limit]
|
153
155
|
word.gsub!(/\'\w+/, '')
|
154
156
|
words[word] ||= 0
|
155
|
-
words[word] += 1
|
157
|
+
words[word] += (cached_title =~ /#{word}/i ? 5 : 1)
|
156
158
|
end
|
157
159
|
|
158
160
|
# Stem the words and stop words if necessary
|
@@ -168,8 +170,8 @@ module Pismo
|
|
168
170
|
@body ||= Readability::Document.new(@doc.to_s).content.strip
|
169
171
|
|
170
172
|
# HACK: Remove annoying DIV that readability leaves around
|
171
|
-
@body.
|
172
|
-
@body.
|
173
|
+
@body.sub!(/\A\<div\>/, '')
|
174
|
+
@body.sub!(/\<\/div\>\Z/, '')
|
173
175
|
|
174
176
|
return @body
|
175
177
|
end
|
data/lib/pismo/readability.rb
CHANGED
@@ -174,13 +174,13 @@ module Readability
|
|
174
174
|
def score_node(elem)
|
175
175
|
content_score = class_weight(elem)
|
176
176
|
case elem.name.downcase
|
177
|
-
when "div"
|
177
|
+
when "div"
|
178
178
|
content_score += 5
|
179
|
-
when "blockquote"
|
179
|
+
when "blockquote"
|
180
180
|
content_score += 3
|
181
|
-
when "form"
|
181
|
+
when "form"
|
182
182
|
content_score -= 3
|
183
|
-
when "th"
|
183
|
+
when "th"
|
184
184
|
content_score -= 5
|
185
185
|
end
|
186
186
|
{ :content_score => content_score, :elem => elem }
|
data/lib/pismo/stopwords.txt
CHANGED
data/pismo.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{pismo}
|
8
|
-
s.version = "0.2.2"
|
8
|
+
s.version = "0.2.3"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Peter Cooper"]
|
12
|
-
s.date = %q{2010-03
|
12
|
+
s.date = %q{2010-05-03}
|
13
13
|
s.default_executable = %q{pismo}
|
14
14
|
s.description = %q{Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, body text, keywords, RSS feed URL, favicon, etc.}
|
15
15
|
s.email = %q{git@peterc.org}
|
@@ -39,6 +39,7 @@ Gem::Specification.new do |s|
|
|
39
39
|
"test/corpus/factor.html",
|
40
40
|
"test/corpus/huffington.html",
|
41
41
|
"test/corpus/metadata_expected.yaml",
|
42
|
+
"test/corpus/metadata_expected.yaml.old",
|
42
43
|
"test/corpus/rubyinside.html",
|
43
44
|
"test/corpus/rww.html",
|
44
45
|
"test/corpus/spolsky.html",
|
@@ -1,10 +1,4 @@
|
|
1
1
|
---
|
2
|
-
:rubyinside:
|
3
|
-
:title: "CoffeeScript: A New Language With A Pure Ruby Compiler"
|
4
|
-
:author: Peter Cooper
|
5
|
-
:lede: CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it "JavaScript's less ostentatious kid brother" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.
|
6
|
-
:body: "CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it \"JavaScript's less ostentatious kid brother\" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.\nTo get a feel for the language, check out this example code (CoffeeScript on the left, resulting JavaScript on the right):\nAs a Ruby project, you can get the CoffeeScript compiler installed with a simple gem install coffee-script or check out the code from/on GitHub. The code is worth a look as it's notably quite vanilla with hand crafted Ruby covering the lexer and code generation and Racc built code for the parser.\n"
|
7
|
-
:feed: http://www.rubyinside.com/feed/
|
8
2
|
:rww:
|
9
3
|
:title: "Cartoon: Apple Tablet: Now With Barometer and Bird Call Generator"
|
10
4
|
:feed: http://www.readwriteweb.com/rss.xml
|
@@ -38,44 +32,9 @@
|
|
38
32
|
:techcrunch:
|
39
33
|
:title: Googlle Gets A Sexy New Logo; Remains Sketchy
|
40
34
|
:author: MG Siegler
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
- 3
|
48
|
-
- - logo
|
49
|
-
- 2
|
50
|
-
- - font
|
51
|
-
- 2
|
52
|
-
- - india
|
53
|
-
- 2
|
54
|
-
- - surprised
|
55
|
-
- 1
|
56
|
-
- - week
|
57
|
-
- 1
|
58
|
-
- - switched
|
59
|
-
- 1
|
60
|
-
- - school
|
61
|
-
- 1
|
62
|
-
- - things
|
63
|
-
- 1
|
64
|
-
- - removing
|
65
|
-
- 1
|
66
|
-
- - steve
|
67
|
-
- 1
|
68
|
-
- - decided
|
69
|
-
- 1
|
70
|
-
- - advantage
|
71
|
-
- 1
|
72
|
-
- - wasn
|
73
|
-
- 1
|
74
|
-
- - accepting
|
75
|
-
- 1
|
76
|
-
- - red
|
77
|
-
- 1
|
78
|
-
- - copy
|
79
|
-
- 1
|
80
|
-
- - wouldn
|
81
|
-
- 1
|
35
|
+
:rubyinside:
|
36
|
+
:title: "CoffeeScript: A New Language With A Pure Ruby Compiler"
|
37
|
+
:author: Peter Cooper
|
38
|
+
:lede: CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it "JavaScript's less ostentatious kid brother" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.
|
39
|
+
:body: "CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it \"JavaScript's less ostentatious kid brother\" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.\nTo get a feel for the language, check out this example code (CoffeeScript on the left, resulting JavaScript on the right):\nAs a Ruby project, you can get the CoffeeScript compiler installed with a simple gem install coffee-script or check out the code from/on GitHub. The code is worth a look as it's notably quite vanilla with hand crafted Ruby covering the lexer and code generation and Racc built code for the parser.\n"
|
40
|
+
:feed: http://www.rubyinside.com/feed/
|
@@ -0,0 +1,122 @@
|
|
1
|
+
---
|
2
|
+
:rww:
|
3
|
+
:title: "Cartoon: Apple Tablet: Now With Barometer and Bird Call Generator"
|
4
|
+
:feed: http://www.readwriteweb.com/rss.xml
|
5
|
+
:briancray:
|
6
|
+
:title: 5 great examples of popular blog posts that you should know
|
7
|
+
:feed: http://feeds.feedburner.com/briancray/blog
|
8
|
+
:lede: "This is a mock post. While there is a place for all of these posts, I'm trying to make a point that original blogs are being shut out by formulaic blogs."
|
9
|
+
:huffington:
|
10
|
+
:title: Afghans Losing Hope After 8 Years Of War
|
11
|
+
:author: TODD PITMAN
|
12
|
+
:feed: http://feeds.huffingtonpost.com/huffingtonpost/raw_feed
|
13
|
+
:lede: "KABUL - The man on the motorcycle was going the wrong way down a one-way street, gesturing indignantly for the phalanx of traffic-clogged cars in front of him to move."
|
14
|
+
:bbcnews:
|
15
|
+
:title: Gay Muslims made homeless by family violence
|
16
|
+
:author: Poonam Taneja
|
17
|
+
:description: A charity is dealing with more gay Muslims made homeless after fleeing forced marriages and so-called "honour" violence.
|
18
|
+
:lede: A UK charity is dealing with an increasing number of young gay Muslims becoming homeless after fleeing forced marriages and so-called honour violence.
|
19
|
+
:feed: http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/rss.xml
|
20
|
+
:factor:
|
21
|
+
:title: Factor's bootstrap process explained
|
22
|
+
:lede: "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap. "
|
23
|
+
:youtube:
|
24
|
+
:title: YMO - Rydeen (Official Video)
|
25
|
+
:author: ymo1965
|
26
|
+
:spolsky:
|
27
|
+
:title: The Absolute Minimum Every Software Developer Absolutely, Positively Must Know About Unicode and Character Sets (No Excuses!)
|
28
|
+
:description: Haven't mastered the basics of Unicode and character sets? Please don't write another line of code until you've read this article.
|
29
|
+
:author: Joel Spolsky
|
30
|
+
:favicon: /favicon.ico
|
31
|
+
:feed: http://www.joelonsoftware.com/rss.xml
|
32
|
+
:techcrunch:
|
33
|
+
:title: Googlle Gets A Sexy New Logo; Remains Sketchy
|
34
|
+
:author: MG Siegler
|
35
|
+
:keywords:
|
36
|
+
- - googlle
|
37
|
+
- 35
|
38
|
+
- - logo
|
39
|
+
- 10
|
40
|
+
- - google
|
41
|
+
- 6
|
42
|
+
- - site
|
43
|
+
- 3
|
44
|
+
- - font
|
45
|
+
- 2
|
46
|
+
- - india
|
47
|
+
- 2
|
48
|
+
- - surprised
|
49
|
+
- 1
|
50
|
+
- - week
|
51
|
+
- 1
|
52
|
+
- - switched
|
53
|
+
- 1
|
54
|
+
- - school
|
55
|
+
- 1
|
56
|
+
- - things
|
57
|
+
- 1
|
58
|
+
- - removing
|
59
|
+
- 1
|
60
|
+
- - steve
|
61
|
+
- 1
|
62
|
+
- - decided
|
63
|
+
- 1
|
64
|
+
- - advantage
|
65
|
+
- 1
|
66
|
+
- - wasn
|
67
|
+
- 1
|
68
|
+
- - accepting
|
69
|
+
- 1
|
70
|
+
- - red
|
71
|
+
- 1
|
72
|
+
- - copy
|
73
|
+
- 1
|
74
|
+
- - wouldn
|
75
|
+
- 1
|
76
|
+
:rubyinside:
|
77
|
+
:title: "CoffeeScript: A New Language With A Pure Ruby Compiler"
|
78
|
+
:author: Peter Cooper
|
79
|
+
:lede: CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it "JavaScript's less ostentatious kid brother" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.
|
80
|
+
:body: "CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it \"JavaScript's less ostentatious kid brother\" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.\nTo get a feel for the language, check out this example code (CoffeeScript on the left, resulting JavaScript on the right):\nAs a Ruby project, you can get the CoffeeScript compiler installed with a simple gem install coffee-script or check out the code from/on GitHub. The code is worth a look as it's notably quite vanilla with hand crafted Ruby covering the lexer and code generation and Racc built code for the parser.\n"
|
81
|
+
:feed: http://www.rubyinside.com/feed/
|
82
|
+
:keywords:
|
83
|
+
- - ruby
|
84
|
+
- 15
|
85
|
+
- - coffeescript
|
86
|
+
- 15
|
87
|
+
- - compiler
|
88
|
+
- 10
|
89
|
+
- - language
|
90
|
+
- 10
|
91
|
+
- - coffee
|
92
|
+
- 5
|
93
|
+
- - pure
|
94
|
+
- 5
|
95
|
+
- - code
|
96
|
+
- 5
|
97
|
+
- - script
|
98
|
+
- 5
|
99
|
+
- - javascript
|
100
|
+
- 3
|
101
|
+
- - github
|
102
|
+
- 2
|
103
|
+
- - syntax
|
104
|
+
- 1
|
105
|
+
- - programming
|
106
|
+
- 1
|
107
|
+
- - brother
|
108
|
+
- 1
|
109
|
+
- - constructs
|
110
|
+
- 1
|
111
|
+
- - vanilla
|
112
|
+
- 1
|
113
|
+
- - parser
|
114
|
+
- 1
|
115
|
+
- - lexer
|
116
|
+
- 1
|
117
|
+
- - project
|
118
|
+
- 1
|
119
|
+
- - installed
|
120
|
+
- 1
|
121
|
+
- - simple
|
122
|
+
- 1
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pismo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Cooper
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-03
|
12
|
+
date: 2010-05-03 00:00:00 +01:00
|
13
13
|
default_executable: pismo
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -102,6 +102,7 @@ files:
|
|
102
102
|
- test/corpus/factor.html
|
103
103
|
- test/corpus/huffington.html
|
104
104
|
- test/corpus/metadata_expected.yaml
|
105
|
+
- test/corpus/metadata_expected.yaml.old
|
105
106
|
- test/corpus/rubyinside.html
|
106
107
|
- test/corpus/rww.html
|
107
108
|
- test/corpus/spolsky.html
|