pismo 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.2
1
+ 0.2.3
@@ -1,3 +1,5 @@
1
+ # encoding: utf-8
2
+
1
3
  require 'open-uri'
2
4
  require 'nokogiri'
3
5
  require 'fast_stemmer'
@@ -33,9 +35,12 @@ class Nokogiri::HTML::Document
33
35
  elsif query.is_a?(Array)
34
36
  result = query[1].call(self.search(query.first).first).strip rescue nil
35
37
  end
38
+
36
39
  if result
37
- result.gsub!(/\342\200\231/, '\'')
38
- result.gsub!(/\342\200\224/, '-')
40
+ # result.gsub!(/\342\200\231/, '\'')
41
+ # result.gsub!(/\342\200\224/, '-')
42
+ result.gsub!('’', '\'')
43
+ result.gsub!('—', '-')
39
44
  return result
40
45
  end
41
46
  end
@@ -3,6 +3,7 @@ module Pismo
3
3
  module InternalAttributes
4
4
  # Returns the title of the page/content - attempts to strip site name, etc, if possible
5
5
  def title
6
+ # TODO: Memoizations
6
7
  title = @doc.match( 'h2.title',
7
8
  '.entry h2', # Common style
8
9
  '.entryheader h1', # Ruby Inside/Kubrick
@@ -147,12 +148,13 @@ module Pismo
147
148
 
148
149
  words = {}
149
150
 
150
- # Convert doc to lowercase, scrub out most HTML tags
151
- body.downcase.gsub(/\<[^\>]{1,100}\>/, '').gsub(/\&\w+\;/, '').scan(/\b[a-z][a-z\'\#\.]*\b/).each do |word|
151
+ # Convert doc to lowercase, scrub out most HTML tags, then keep track of words
152
+ cached_title = title
153
+ body.downcase.gsub(/\<[^\>]{1,100}\>/, '').gsub(/\&\w+\;/, '').scan(/\b[a-z][a-z\'\+\#\.]*\b/).each do |word|
152
154
  next if word.length > options[:word_length_limit]
153
155
  word.gsub!(/\'\w+/, '')
154
156
  words[word] ||= 0
155
- words[word] += 1
157
+ words[word] += (cached_title =~ /#{word}/i ? 5 : 1)
156
158
  end
157
159
 
158
160
  # Stem the words and stop words if necessary
@@ -168,8 +170,8 @@ module Pismo
168
170
  @body ||= Readability::Document.new(@doc.to_s).content.strip
169
171
 
170
172
  # HACK: Remove annoying DIV that readability leaves around
171
- @body.gsub!(/\A\<div\>/, '')
172
- @body.gsub!(/\<\/div\>\Z/, '')
173
+ @body.sub!(/\A\<div\>/, '')
174
+ @body.sub!(/\<\/div\>\Z/, '')
173
175
 
174
176
  return @body
175
177
  end
@@ -174,13 +174,13 @@ module Readability
174
174
  def score_node(elem)
175
175
  content_score = class_weight(elem)
176
176
  case elem.name.downcase
177
- when "div":
177
+ when "div"
178
178
  content_score += 5
179
- when "blockquote":
179
+ when "blockquote"
180
180
  content_score += 3
181
- when "form":
181
+ when "form"
182
182
  content_score -= 3
183
- when "th":
183
+ when "th"
184
184
  content_score -= 5
185
185
  end
186
186
  { :content_score => content_score, :elem => elem }
@@ -890,4 +890,7 @@ Erin
890
890
  Nevaeh
891
891
  Brooklyn
892
892
  Marissa
893
- yeah
893
+ yeah
894
+ covering
895
+ kid
896
+ notably
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{pismo}
8
- s.version = "0.2.2"
8
+ s.version = "0.2.3"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Peter Cooper"]
12
- s.date = %q{2010-03-26}
12
+ s.date = %q{2010-05-03}
13
13
  s.default_executable = %q{pismo}
14
14
  s.description = %q{Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, body text, keywords, RSS feed URL, favicon, etc.}
15
15
  s.email = %q{git@peterc.org}
@@ -39,6 +39,7 @@ Gem::Specification.new do |s|
39
39
  "test/corpus/factor.html",
40
40
  "test/corpus/huffington.html",
41
41
  "test/corpus/metadata_expected.yaml",
42
+ "test/corpus/metadata_expected.yaml.old",
42
43
  "test/corpus/rubyinside.html",
43
44
  "test/corpus/rww.html",
44
45
  "test/corpus/spolsky.html",
@@ -1,10 +1,4 @@
1
1
  ---
2
- :rubyinside:
3
- :title: "CoffeeScript: A New Language With A Pure Ruby Compiler"
4
- :author: Peter Cooper
5
- :lede: CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it "JavaScript's less ostentatious kid brother" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.
6
- :body: "CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it \"JavaScript's less ostentatious kid brother\" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.\nTo get a feel for the language, check out this example code (CoffeeScript on the left, resulting JavaScript on the right):\nAs a Ruby project, you can get the CoffeeScript compiler installed with a simple gem install coffee-script or check out the code from/on GitHub. The code is worth a look as it's notably quite vanilla with hand crafted Ruby covering the lexer and code generation and Racc built code for the parser.\n"
7
- :feed: http://www.rubyinside.com/feed/
8
2
  :rww:
9
3
  :title: "Cartoon: Apple Tablet: Now With Barometer and Bird Call Generator"
10
4
  :feed: http://www.readwriteweb.com/rss.xml
@@ -38,44 +32,9 @@
38
32
  :techcrunch:
39
33
  :title: Googlle Gets A Sexy New Logo; Remains Sketchy
40
34
  :author: MG Siegler
41
- :keywords:
42
- - - googlle
43
- - 7
44
- - - google
45
- - 6
46
- - - site
47
- - 3
48
- - - logo
49
- - 2
50
- - - font
51
- - 2
52
- - - india
53
- - 2
54
- - - surprised
55
- - 1
56
- - - week
57
- - 1
58
- - - switched
59
- - 1
60
- - - school
61
- - 1
62
- - - things
63
- - 1
64
- - - removing
65
- - 1
66
- - - steve
67
- - 1
68
- - - decided
69
- - 1
70
- - - advantage
71
- - 1
72
- - - wasn
73
- - 1
74
- - - accepting
75
- - 1
76
- - - red
77
- - 1
78
- - - copy
79
- - 1
80
- - - wouldn
81
- - 1
35
+ :rubyinside:
36
+ :title: "CoffeeScript: A New Language With A Pure Ruby Compiler"
37
+ :author: Peter Cooper
38
+ :lede: CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it "JavaScript's less ostentatious kid brother" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.
39
+ :body: "CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it \"JavaScript's less ostentatious kid brother\" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.\nTo get a feel for the language, check out this example code (CoffeeScript on the left, resulting JavaScript on the right):\nAs a Ruby project, you can get the CoffeeScript compiler installed with a simple gem install coffee-script or check out the code from/on GitHub. The code is worth a look as it's notably quite vanilla with hand crafted Ruby covering the lexer and code generation and Racc built code for the parser.\n"
40
+ :feed: http://www.rubyinside.com/feed/
@@ -0,0 +1,122 @@
1
+ ---
2
+ :rww:
3
+ :title: "Cartoon: Apple Tablet: Now With Barometer and Bird Call Generator"
4
+ :feed: http://www.readwriteweb.com/rss.xml
5
+ :briancray:
6
+ :title: 5 great examples of popular blog posts that you should know
7
+ :feed: http://feeds.feedburner.com/briancray/blog
8
+ :lede: "This is a mock post. While there is a place for all of these posts, I'm trying to make a point that original blogs are being shut out by formulaic blogs."
9
+ :huffington:
10
+ :title: Afghans Losing Hope After 8 Years Of War
11
+ :author: TODD PITMAN
12
+ :feed: http://feeds.huffingtonpost.com/huffingtonpost/raw_feed
13
+ :lede: "KABUL - The man on the motorcycle was going the wrong way down a one-way street, gesturing indignantly for the phalanx of traffic-clogged cars in front of him to move."
14
+ :bbcnews:
15
+ :title: Gay Muslims made homeless by family violence
16
+ :author: Poonam Taneja
17
+ :description: A charity is dealing with more gay Muslims made homeless after fleeing forced marriages and so-called "honour" violence.
18
+ :lede: A UK charity is dealing with an increasing number of young gay Muslims becoming homeless after fleeing forced marriages and so-called honour violence.
19
+ :feed: http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/rss.xml
20
+ :factor:
21
+ :title: Factor's bootstrap process explained
22
+ :lede: "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap. "
23
+ :youtube:
24
+ :title: YMO - Rydeen (Official Video)
25
+ :author: ymo1965
26
+ :spolsky:
27
+ :title: The Absolute Minimum Every Software Developer Absolutely, Positively Must Know About Unicode and Character Sets (No Excuses!)
28
+ :description: Haven't mastered the basics of Unicode and character sets? Please don't write another line of code until you've read this article.
29
+ :author: Joel Spolsky
30
+ :favicon: /favicon.ico
31
+ :feed: http://www.joelonsoftware.com/rss.xml
32
+ :techcrunch:
33
+ :title: Googlle Gets A Sexy New Logo; Remains Sketchy
34
+ :author: MG Siegler
35
+ :keywords:
36
+ - - googlle
37
+ - 35
38
+ - - logo
39
+ - 10
40
+ - - google
41
+ - 6
42
+ - - site
43
+ - 3
44
+ - - font
45
+ - 2
46
+ - - india
47
+ - 2
48
+ - - surprised
49
+ - 1
50
+ - - week
51
+ - 1
52
+ - - switched
53
+ - 1
54
+ - - school
55
+ - 1
56
+ - - things
57
+ - 1
58
+ - - removing
59
+ - 1
60
+ - - steve
61
+ - 1
62
+ - - decided
63
+ - 1
64
+ - - advantage
65
+ - 1
66
+ - - wasn
67
+ - 1
68
+ - - accepting
69
+ - 1
70
+ - - red
71
+ - 1
72
+ - - copy
73
+ - 1
74
+ - - wouldn
75
+ - 1
76
+ :rubyinside:
77
+ :title: "CoffeeScript: A New Language With A Pure Ruby Compiler"
78
+ :author: Peter Cooper
79
+ :lede: CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it "JavaScript's less ostentatious kid brother" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.
80
+ :body: "CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it \"JavaScript's less ostentatious kid brother\" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.\nTo get a feel for the language, check out this example code (CoffeeScript on the left, resulting JavaScript on the right):\nAs a Ruby project, you can get the CoffeeScript compiler installed with a simple gem install coffee-script or check out the code from/on GitHub. The code is worth a look as it's notably quite vanilla with hand crafted Ruby covering the lexer and code generation and Racc built code for the parser.\n"
81
+ :feed: http://www.rubyinside.com/feed/
82
+ :keywords:
83
+ - - ruby
84
+ - 15
85
+ - - coffeescript
86
+ - 15
87
+ - - compiler
88
+ - 10
89
+ - - language
90
+ - 10
91
+ - - coffee
92
+ - 5
93
+ - - pure
94
+ - 5
95
+ - - code
96
+ - 5
97
+ - - script
98
+ - 5
99
+ - - javascript
100
+ - 3
101
+ - - github
102
+ - 2
103
+ - - syntax
104
+ - 1
105
+ - - programming
106
+ - 1
107
+ - - brother
108
+ - 1
109
+ - - constructs
110
+ - 1
111
+ - - vanilla
112
+ - 1
113
+ - - parser
114
+ - 1
115
+ - - lexer
116
+ - 1
117
+ - - project
118
+ - 1
119
+ - - installed
120
+ - 1
121
+ - - simple
122
+ - 1
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pismo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Cooper
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-03-26 00:00:00 +00:00
12
+ date: 2010-05-03 00:00:00 +01:00
13
13
  default_executable: pismo
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -102,6 +102,7 @@ files:
102
102
  - test/corpus/factor.html
103
103
  - test/corpus/huffington.html
104
104
  - test/corpus/metadata_expected.yaml
105
+ - test/corpus/metadata_expected.yaml.old
105
106
  - test/corpus/rubyinside.html
106
107
  - test/corpus/rww.html
107
108
  - test/corpus/spolsky.html