pismo 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.2
1
+ 0.2.3
@@ -1,3 +1,5 @@
1
+ # encoding: utf-8
2
+
1
3
  require 'open-uri'
2
4
  require 'nokogiri'
3
5
  require 'fast_stemmer'
@@ -33,9 +35,12 @@ class Nokogiri::HTML::Document
33
35
  elsif query.is_a?(Array)
34
36
  result = query[1].call(self.search(query.first).first).strip rescue nil
35
37
  end
38
+
36
39
  if result
37
- result.gsub!(/\342\200\231/, '\'')
38
- result.gsub!(/\342\200\224/, '-')
40
+ # result.gsub!(/\342\200\231/, '\'')
41
+ # result.gsub!(/\342\200\224/, '-')
42
+ result.gsub!('’', '\'')
43
+ result.gsub!('—', '-')
39
44
  return result
40
45
  end
41
46
  end
@@ -3,6 +3,7 @@ module Pismo
3
3
  module InternalAttributes
4
4
  # Returns the title of the page/content - attempts to strip site name, etc, if possible
5
5
  def title
6
+ # TODO: Memoizations
6
7
  title = @doc.match( 'h2.title',
7
8
  '.entry h2', # Common style
8
9
  '.entryheader h1', # Ruby Inside/Kubrick
@@ -147,12 +148,13 @@ module Pismo
147
148
 
148
149
  words = {}
149
150
 
150
- # Convert doc to lowercase, scrub out most HTML tags
151
- body.downcase.gsub(/\<[^\>]{1,100}\>/, '').gsub(/\&\w+\;/, '').scan(/\b[a-z][a-z\'\#\.]*\b/).each do |word|
151
+ # Convert doc to lowercase, scrub out most HTML tags, then keep track of words
152
+ cached_title = title
153
+ body.downcase.gsub(/\<[^\>]{1,100}\>/, '').gsub(/\&\w+\;/, '').scan(/\b[a-z][a-z\'\+\#\.]*\b/).each do |word|
152
154
  next if word.length > options[:word_length_limit]
153
155
  word.gsub!(/\'\w+/, '')
154
156
  words[word] ||= 0
155
- words[word] += 1
157
+ words[word] += (cached_title =~ /#{word}/i ? 5 : 1)
156
158
  end
157
159
 
158
160
  # Stem the words and stop words if necessary
@@ -168,8 +170,8 @@ module Pismo
168
170
  @body ||= Readability::Document.new(@doc.to_s).content.strip
169
171
 
170
172
  # HACK: Remove annoying DIV that readability leaves around
171
- @body.gsub!(/\A\<div\>/, '')
172
- @body.gsub!(/\<\/div\>\Z/, '')
173
+ @body.sub!(/\A\<div\>/, '')
174
+ @body.sub!(/\<\/div\>\Z/, '')
173
175
 
174
176
  return @body
175
177
  end
@@ -174,13 +174,13 @@ module Readability
174
174
  def score_node(elem)
175
175
  content_score = class_weight(elem)
176
176
  case elem.name.downcase
177
- when "div":
177
+ when "div"
178
178
  content_score += 5
179
- when "blockquote":
179
+ when "blockquote"
180
180
  content_score += 3
181
- when "form":
181
+ when "form"
182
182
  content_score -= 3
183
- when "th":
183
+ when "th"
184
184
  content_score -= 5
185
185
  end
186
186
  { :content_score => content_score, :elem => elem }
@@ -890,4 +890,7 @@ Erin
890
890
  Nevaeh
891
891
  Brooklyn
892
892
  Marissa
893
- yeah
893
+ yeah
894
+ covering
895
+ kid
896
+ notably
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{pismo}
8
- s.version = "0.2.2"
8
+ s.version = "0.2.3"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Peter Cooper"]
12
- s.date = %q{2010-03-26}
12
+ s.date = %q{2010-05-03}
13
13
  s.default_executable = %q{pismo}
14
14
  s.description = %q{Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, body text, keywords, RSS feed URL, favicon, etc.}
15
15
  s.email = %q{git@peterc.org}
@@ -39,6 +39,7 @@ Gem::Specification.new do |s|
39
39
  "test/corpus/factor.html",
40
40
  "test/corpus/huffington.html",
41
41
  "test/corpus/metadata_expected.yaml",
42
+ "test/corpus/metadata_expected.yaml.old",
42
43
  "test/corpus/rubyinside.html",
43
44
  "test/corpus/rww.html",
44
45
  "test/corpus/spolsky.html",
@@ -1,10 +1,4 @@
1
1
  ---
2
- :rubyinside:
3
- :title: "CoffeeScript: A New Language With A Pure Ruby Compiler"
4
- :author: Peter Cooper
5
- :lede: CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it "JavaScript's less ostentatious kid brother" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.
6
- :body: "CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it \"JavaScript's less ostentatious kid brother\" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.\nTo get a feel for the language, check out this example code (CoffeeScript on the left, resulting JavaScript on the right):\nAs a Ruby project, you can get the CoffeeScript compiler installed with a simple gem install coffee-script or check out the code from/on GitHub. The code is worth a look as it's notably quite vanilla with hand crafted Ruby covering the lexer and code generation and Racc built code for the parser.\n"
7
- :feed: http://www.rubyinside.com/feed/
8
2
  :rww:
9
3
  :title: "Cartoon: Apple Tablet: Now With Barometer and Bird Call Generator"
10
4
  :feed: http://www.readwriteweb.com/rss.xml
@@ -38,44 +32,9 @@
38
32
  :techcrunch:
39
33
  :title: Googlle Gets A Sexy New Logo; Remains Sketchy
40
34
  :author: MG Siegler
41
- :keywords:
42
- - - googlle
43
- - 7
44
- - - google
45
- - 6
46
- - - site
47
- - 3
48
- - - logo
49
- - 2
50
- - - font
51
- - 2
52
- - - india
53
- - 2
54
- - - surprised
55
- - 1
56
- - - week
57
- - 1
58
- - - switched
59
- - 1
60
- - - school
61
- - 1
62
- - - things
63
- - 1
64
- - - removing
65
- - 1
66
- - - steve
67
- - 1
68
- - - decided
69
- - 1
70
- - - advantage
71
- - 1
72
- - - wasn
73
- - 1
74
- - - accepting
75
- - 1
76
- - - red
77
- - 1
78
- - - copy
79
- - 1
80
- - - wouldn
81
- - 1
35
+ :rubyinside:
36
+ :title: "CoffeeScript: A New Language With A Pure Ruby Compiler"
37
+ :author: Peter Cooper
38
+ :lede: CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it "JavaScript's less ostentatious kid brother" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.
39
+ :body: "CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it \"JavaScript's less ostentatious kid brother\" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.\nTo get a feel for the language, check out this example code (CoffeeScript on the left, resulting JavaScript on the right):\nAs a Ruby project, you can get the CoffeeScript compiler installed with a simple gem install coffee-script or check out the code from/on GitHub. The code is worth a look as it's notably quite vanilla with hand crafted Ruby covering the lexer and code generation and Racc built code for the parser.\n"
40
+ :feed: http://www.rubyinside.com/feed/
@@ -0,0 +1,122 @@
1
+ ---
2
+ :rww:
3
+ :title: "Cartoon: Apple Tablet: Now With Barometer and Bird Call Generator"
4
+ :feed: http://www.readwriteweb.com/rss.xml
5
+ :briancray:
6
+ :title: 5 great examples of popular blog posts that you should know
7
+ :feed: http://feeds.feedburner.com/briancray/blog
8
+ :lede: "This is a mock post. While there is a place for all of these posts, I'm trying to make a point that original blogs are being shut out by formulaic blogs."
9
+ :huffington:
10
+ :title: Afghans Losing Hope After 8 Years Of War
11
+ :author: TODD PITMAN
12
+ :feed: http://feeds.huffingtonpost.com/huffingtonpost/raw_feed
13
+ :lede: "KABUL - The man on the motorcycle was going the wrong way down a one-way street, gesturing indignantly for the phalanx of traffic-clogged cars in front of him to move."
14
+ :bbcnews:
15
+ :title: Gay Muslims made homeless by family violence
16
+ :author: Poonam Taneja
17
+ :description: A charity is dealing with more gay Muslims made homeless after fleeing forced marriages and so-called "honour" violence.
18
+ :lede: A UK charity is dealing with an increasing number of young gay Muslims becoming homeless after fleeing forced marriages and so-called honour violence.
19
+ :feed: http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/rss.xml
20
+ :factor:
21
+ :title: Factor's bootstrap process explained
22
+ :lede: "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap. "
23
+ :youtube:
24
+ :title: YMO - Rydeen (Official Video)
25
+ :author: ymo1965
26
+ :spolsky:
27
+ :title: The Absolute Minimum Every Software Developer Absolutely, Positively Must Know About Unicode and Character Sets (No Excuses!)
28
+ :description: Haven't mastered the basics of Unicode and character sets? Please don't write another line of code until you've read this article.
29
+ :author: Joel Spolsky
30
+ :favicon: /favicon.ico
31
+ :feed: http://www.joelonsoftware.com/rss.xml
32
+ :techcrunch:
33
+ :title: Googlle Gets A Sexy New Logo; Remains Sketchy
34
+ :author: MG Siegler
35
+ :keywords:
36
+ - - googlle
37
+ - 35
38
+ - - logo
39
+ - 10
40
+ - - google
41
+ - 6
42
+ - - site
43
+ - 3
44
+ - - font
45
+ - 2
46
+ - - india
47
+ - 2
48
+ - - surprised
49
+ - 1
50
+ - - week
51
+ - 1
52
+ - - switched
53
+ - 1
54
+ - - school
55
+ - 1
56
+ - - things
57
+ - 1
58
+ - - removing
59
+ - 1
60
+ - - steve
61
+ - 1
62
+ - - decided
63
+ - 1
64
+ - - advantage
65
+ - 1
66
+ - - wasn
67
+ - 1
68
+ - - accepting
69
+ - 1
70
+ - - red
71
+ - 1
72
+ - - copy
73
+ - 1
74
+ - - wouldn
75
+ - 1
76
+ :rubyinside:
77
+ :title: "CoffeeScript: A New Language With A Pure Ruby Compiler"
78
+ :author: Peter Cooper
79
+ :lede: CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it "JavaScript's less ostentatious kid brother" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.
80
+ :body: "CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it \"JavaScript's less ostentatious kid brother\" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.\nTo get a feel for the language, check out this example code (CoffeeScript on the left, resulting JavaScript on the right):\nAs a Ruby project, you can get the CoffeeScript compiler installed with a simple gem install coffee-script or check out the code from/on GitHub. The code is worth a look as it's notably quite vanilla with hand crafted Ruby covering the lexer and code generation and Racc built code for the parser.\n"
81
+ :feed: http://www.rubyinside.com/feed/
82
+ :keywords:
83
+ - - ruby
84
+ - 15
85
+ - - coffeescript
86
+ - 15
87
+ - - compiler
88
+ - 10
89
+ - - language
90
+ - 10
91
+ - - coffee
92
+ - 5
93
+ - - pure
94
+ - 5
95
+ - - code
96
+ - 5
97
+ - - script
98
+ - 5
99
+ - - javascript
100
+ - 3
101
+ - - github
102
+ - 2
103
+ - - syntax
104
+ - 1
105
+ - - programming
106
+ - 1
107
+ - - brother
108
+ - 1
109
+ - - constructs
110
+ - 1
111
+ - - vanilla
112
+ - 1
113
+ - - parser
114
+ - 1
115
+ - - lexer
116
+ - 1
117
+ - - project
118
+ - 1
119
+ - - installed
120
+ - 1
121
+ - - simple
122
+ - 1
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pismo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Cooper
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-03-26 00:00:00 +00:00
12
+ date: 2010-05-03 00:00:00 +01:00
13
13
  default_executable: pismo
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -102,6 +102,7 @@ files:
102
102
  - test/corpus/factor.html
103
103
  - test/corpus/huffington.html
104
104
  - test/corpus/metadata_expected.yaml
105
+ - test/corpus/metadata_expected.yaml.old
105
106
  - test/corpus/rubyinside.html
106
107
  - test/corpus/rww.html
107
108
  - test/corpus/spolsky.html