pismo 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/pismo.rb +7 -2
- data/lib/pismo/internal_attributes.rb +7 -5
- data/lib/pismo/readability.rb +4 -4
- data/lib/pismo/stopwords.txt +4 -1
- data/pismo.gemspec +3 -2
- data/test/corpus/metadata_expected.yaml +6 -47
- data/test/corpus/metadata_expected.yaml.old +122 -0
- metadata +3 -2
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.2
|
1
|
+
0.2.3
|
data/lib/pismo.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
1
3
|
require 'open-uri'
|
2
4
|
require 'nokogiri'
|
3
5
|
require 'fast_stemmer'
|
@@ -33,9 +35,12 @@ class Nokogiri::HTML::Document
|
|
33
35
|
elsif query.is_a?(Array)
|
34
36
|
result = query[1].call(self.search(query.first).first).strip rescue nil
|
35
37
|
end
|
38
|
+
|
36
39
|
if result
|
37
|
-
|
38
|
-
|
40
|
+
# result.gsub!(/\342\200\231/, '\'')
|
41
|
+
# result.gsub!(/\342\200\224/, '-')
|
42
|
+
result.gsub!('’', '\'')
|
43
|
+
result.gsub!('—', '-')
|
39
44
|
return result
|
40
45
|
end
|
41
46
|
end
|
@@ -3,6 +3,7 @@ module Pismo
|
|
3
3
|
module InternalAttributes
|
4
4
|
# Returns the title of the page/content - attempts to strip site name, etc, if possible
|
5
5
|
def title
|
6
|
+
# TODO: Memoizations
|
6
7
|
title = @doc.match( 'h2.title',
|
7
8
|
'.entry h2', # Common style
|
8
9
|
'.entryheader h1', # Ruby Inside/Kubrick
|
@@ -147,12 +148,13 @@ module Pismo
|
|
147
148
|
|
148
149
|
words = {}
|
149
150
|
|
150
|
-
# Convert doc to lowercase, scrub out most HTML tags
|
151
|
-
|
151
|
+
# Convert doc to lowercase, scrub out most HTML tags, then keep track of words
|
152
|
+
cached_title = title
|
153
|
+
body.downcase.gsub(/\<[^\>]{1,100}\>/, '').gsub(/\&\w+\;/, '').scan(/\b[a-z][a-z\'\+\#\.]*\b/).each do |word|
|
152
154
|
next if word.length > options[:word_length_limit]
|
153
155
|
word.gsub!(/\'\w+/, '')
|
154
156
|
words[word] ||= 0
|
155
|
-
words[word] += 1
|
157
|
+
words[word] += (cached_title =~ /#{word}/i ? 5 : 1)
|
156
158
|
end
|
157
159
|
|
158
160
|
# Stem the words and stop words if necessary
|
@@ -168,8 +170,8 @@ module Pismo
|
|
168
170
|
@body ||= Readability::Document.new(@doc.to_s).content.strip
|
169
171
|
|
170
172
|
# HACK: Remove annoying DIV that readability leaves around
|
171
|
-
@body.
|
172
|
-
@body.
|
173
|
+
@body.sub!(/\A\<div\>/, '')
|
174
|
+
@body.sub!(/\<\/div\>\Z/, '')
|
173
175
|
|
174
176
|
return @body
|
175
177
|
end
|
data/lib/pismo/readability.rb
CHANGED
@@ -174,13 +174,13 @@ module Readability
|
|
174
174
|
def score_node(elem)
|
175
175
|
content_score = class_weight(elem)
|
176
176
|
case elem.name.downcase
|
177
|
-
when "div"
|
177
|
+
when "div"
|
178
178
|
content_score += 5
|
179
|
-
when "blockquote"
|
179
|
+
when "blockquote"
|
180
180
|
content_score += 3
|
181
|
-
when "form"
|
181
|
+
when "form"
|
182
182
|
content_score -= 3
|
183
|
-
when "th"
|
183
|
+
when "th"
|
184
184
|
content_score -= 5
|
185
185
|
end
|
186
186
|
{ :content_score => content_score, :elem => elem }
|
data/lib/pismo/stopwords.txt
CHANGED
data/pismo.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{pismo}
|
8
|
-
s.version = "0.2.2"
|
8
|
+
s.version = "0.2.3"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Peter Cooper"]
|
12
|
-
s.date = %q{2010-03
|
12
|
+
s.date = %q{2010-05-03}
|
13
13
|
s.default_executable = %q{pismo}
|
14
14
|
s.description = %q{Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, body text, keywords, RSS feed URL, favicon, etc.}
|
15
15
|
s.email = %q{git@peterc.org}
|
@@ -39,6 +39,7 @@ Gem::Specification.new do |s|
|
|
39
39
|
"test/corpus/factor.html",
|
40
40
|
"test/corpus/huffington.html",
|
41
41
|
"test/corpus/metadata_expected.yaml",
|
42
|
+
"test/corpus/metadata_expected.yaml.old",
|
42
43
|
"test/corpus/rubyinside.html",
|
43
44
|
"test/corpus/rww.html",
|
44
45
|
"test/corpus/spolsky.html",
|
@@ -1,10 +1,4 @@
|
|
1
1
|
---
|
2
|
-
:rubyinside:
|
3
|
-
:title: "CoffeeScript: A New Language With A Pure Ruby Compiler"
|
4
|
-
:author: Peter Cooper
|
5
|
-
:lede: CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it "JavaScript's less ostentatious kid brother" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.
|
6
|
-
:body: "CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it \"JavaScript's less ostentatious kid brother\" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.\nTo get a feel for the language, check out this example code (CoffeeScript on the left, resulting JavaScript on the right):\nAs a Ruby project, you can get the CoffeeScript compiler installed with a simple gem install coffee-script or check out the code from/on GitHub. The code is worth a look as it's notably quite vanilla with hand crafted Ruby covering the lexer and code generation and Racc built code for the parser.\n"
|
7
|
-
:feed: http://www.rubyinside.com/feed/
|
8
2
|
:rww:
|
9
3
|
:title: "Cartoon: Apple Tablet: Now With Barometer and Bird Call Generator"
|
10
4
|
:feed: http://www.readwriteweb.com/rss.xml
|
@@ -38,44 +32,9 @@
|
|
38
32
|
:techcrunch:
|
39
33
|
:title: Googlle Gets A Sexy New Logo; Remains Sketchy
|
40
34
|
:author: MG Siegler
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
- 3
|
48
|
-
- - logo
|
49
|
-
- 2
|
50
|
-
- - font
|
51
|
-
- 2
|
52
|
-
- - india
|
53
|
-
- 2
|
54
|
-
- - surprised
|
55
|
-
- 1
|
56
|
-
- - week
|
57
|
-
- 1
|
58
|
-
- - switched
|
59
|
-
- 1
|
60
|
-
- - school
|
61
|
-
- 1
|
62
|
-
- - things
|
63
|
-
- 1
|
64
|
-
- - removing
|
65
|
-
- 1
|
66
|
-
- - steve
|
67
|
-
- 1
|
68
|
-
- - decided
|
69
|
-
- 1
|
70
|
-
- - advantage
|
71
|
-
- 1
|
72
|
-
- - wasn
|
73
|
-
- 1
|
74
|
-
- - accepting
|
75
|
-
- 1
|
76
|
-
- - red
|
77
|
-
- 1
|
78
|
-
- - copy
|
79
|
-
- 1
|
80
|
-
- - wouldn
|
81
|
-
- 1
|
35
|
+
:rubyinside:
|
36
|
+
:title: "CoffeeScript: A New Language With A Pure Ruby Compiler"
|
37
|
+
:author: Peter Cooper
|
38
|
+
:lede: CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it "JavaScript's less ostentatious kid brother" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.
|
39
|
+
:body: "CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it \"JavaScript's less ostentatious kid brother\" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.\nTo get a feel for the language, check out this example code (CoffeeScript on the left, resulting JavaScript on the right):\nAs a Ruby project, you can get the CoffeeScript compiler installed with a simple gem install coffee-script or check out the code from/on GitHub. The code is worth a look as it's notably quite vanilla with hand crafted Ruby covering the lexer and code generation and Racc built code for the parser.\n"
|
40
|
+
:feed: http://www.rubyinside.com/feed/
|
@@ -0,0 +1,122 @@
|
|
1
|
+
---
|
2
|
+
:rww:
|
3
|
+
:title: "Cartoon: Apple Tablet: Now With Barometer and Bird Call Generator"
|
4
|
+
:feed: http://www.readwriteweb.com/rss.xml
|
5
|
+
:briancray:
|
6
|
+
:title: 5 great examples of popular blog posts that you should know
|
7
|
+
:feed: http://feeds.feedburner.com/briancray/blog
|
8
|
+
:lede: "This is a mock post. While there is a place for all of these posts, I'm trying to make a point that original blogs are being shut out by formulaic blogs."
|
9
|
+
:huffington:
|
10
|
+
:title: Afghans Losing Hope After 8 Years Of War
|
11
|
+
:author: TODD PITMAN
|
12
|
+
:feed: http://feeds.huffingtonpost.com/huffingtonpost/raw_feed
|
13
|
+
:lede: "KABUL - The man on the motorcycle was going the wrong way down a one-way street, gesturing indignantly for the phalanx of traffic-clogged cars in front of him to move."
|
14
|
+
:bbcnews:
|
15
|
+
:title: Gay Muslims made homeless by family violence
|
16
|
+
:author: Poonam Taneja
|
17
|
+
:description: A charity is dealing with more gay Muslims made homeless after fleeing forced marriages and so-called "honour" violence.
|
18
|
+
:lede: A UK charity is dealing with an increasing number of young gay Muslims becoming homeless after fleeing forced marriages and so-called honour violence.
|
19
|
+
:feed: http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/rss.xml
|
20
|
+
:factor:
|
21
|
+
:title: Factor's bootstrap process explained
|
22
|
+
:lede: "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap. "
|
23
|
+
:youtube:
|
24
|
+
:title: YMO - Rydeen (Official Video)
|
25
|
+
:author: ymo1965
|
26
|
+
:spolsky:
|
27
|
+
:title: The Absolute Minimum Every Software Developer Absolutely, Positively Must Know About Unicode and Character Sets (No Excuses!)
|
28
|
+
:description: Haven't mastered the basics of Unicode and character sets? Please don't write another line of code until you've read this article.
|
29
|
+
:author: Joel Spolsky
|
30
|
+
:favicon: /favicon.ico
|
31
|
+
:feed: http://www.joelonsoftware.com/rss.xml
|
32
|
+
:techcrunch:
|
33
|
+
:title: Googlle Gets A Sexy New Logo; Remains Sketchy
|
34
|
+
:author: MG Siegler
|
35
|
+
:keywords:
|
36
|
+
- - googlle
|
37
|
+
- 35
|
38
|
+
- - logo
|
39
|
+
- 10
|
40
|
+
- - google
|
41
|
+
- 6
|
42
|
+
- - site
|
43
|
+
- 3
|
44
|
+
- - font
|
45
|
+
- 2
|
46
|
+
- - india
|
47
|
+
- 2
|
48
|
+
- - surprised
|
49
|
+
- 1
|
50
|
+
- - week
|
51
|
+
- 1
|
52
|
+
- - switched
|
53
|
+
- 1
|
54
|
+
- - school
|
55
|
+
- 1
|
56
|
+
- - things
|
57
|
+
- 1
|
58
|
+
- - removing
|
59
|
+
- 1
|
60
|
+
- - steve
|
61
|
+
- 1
|
62
|
+
- - decided
|
63
|
+
- 1
|
64
|
+
- - advantage
|
65
|
+
- 1
|
66
|
+
- - wasn
|
67
|
+
- 1
|
68
|
+
- - accepting
|
69
|
+
- 1
|
70
|
+
- - red
|
71
|
+
- 1
|
72
|
+
- - copy
|
73
|
+
- 1
|
74
|
+
- - wouldn
|
75
|
+
- 1
|
76
|
+
:rubyinside:
|
77
|
+
:title: "CoffeeScript: A New Language With A Pure Ruby Compiler"
|
78
|
+
:author: Peter Cooper
|
79
|
+
:lede: CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it "JavaScript's less ostentatious kid brother" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.
|
80
|
+
:body: "CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it \"JavaScript's less ostentatious kid brother\" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.\nTo get a feel for the language, check out this example code (CoffeeScript on the left, resulting JavaScript on the right):\nAs a Ruby project, you can get the CoffeeScript compiler installed with a simple gem install coffee-script or check out the code from/on GitHub. The code is worth a look as it's notably quite vanilla with hand crafted Ruby covering the lexer and code generation and Racc built code for the parser.\n"
|
81
|
+
:feed: http://www.rubyinside.com/feed/
|
82
|
+
:keywords:
|
83
|
+
- - ruby
|
84
|
+
- 15
|
85
|
+
- - coffeescript
|
86
|
+
- 15
|
87
|
+
- - compiler
|
88
|
+
- 10
|
89
|
+
- - language
|
90
|
+
- 10
|
91
|
+
- - coffee
|
92
|
+
- 5
|
93
|
+
- - pure
|
94
|
+
- 5
|
95
|
+
- - code
|
96
|
+
- 5
|
97
|
+
- - script
|
98
|
+
- 5
|
99
|
+
- - javascript
|
100
|
+
- 3
|
101
|
+
- - github
|
102
|
+
- 2
|
103
|
+
- - syntax
|
104
|
+
- 1
|
105
|
+
- - programming
|
106
|
+
- 1
|
107
|
+
- - brother
|
108
|
+
- 1
|
109
|
+
- - constructs
|
110
|
+
- 1
|
111
|
+
- - vanilla
|
112
|
+
- 1
|
113
|
+
- - parser
|
114
|
+
- 1
|
115
|
+
- - lexer
|
116
|
+
- 1
|
117
|
+
- - project
|
118
|
+
- 1
|
119
|
+
- - installed
|
120
|
+
- 1
|
121
|
+
- - simple
|
122
|
+
- 1
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pismo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Cooper
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-03
|
12
|
+
date: 2010-05-03 00:00:00 +01:00
|
13
13
|
default_executable: pismo
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -102,6 +102,7 @@ files:
|
|
102
102
|
- test/corpus/factor.html
|
103
103
|
- test/corpus/huffington.html
|
104
104
|
- test/corpus/metadata_expected.yaml
|
105
|
+
- test/corpus/metadata_expected.yaml.old
|
105
106
|
- test/corpus/rubyinside.html
|
106
107
|
- test/corpus/rww.html
|
107
108
|
- test/corpus/spolsky.html
|