pismo 0.2.3 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +25 -20
- data/VERSION +1 -1
- data/lib/pismo/document.rb +3 -3
- data/lib/pismo/internal_attributes.rb +127 -47
- data/lib/pismo/readability.rb +6 -1
- data/lib/pismo/stopwords.txt +452 -326
- data/lib/pismo.rb +10 -4
- data/pismo.gemspec +2 -2
- data/test/corpus/metadata_expected.yaml +17 -0
- metadata +2 -2
data/lib/pismo.rb
CHANGED
@@ -4,6 +4,7 @@ require 'open-uri'
|
|
4
4
|
require 'nokogiri'
|
5
5
|
require 'fast_stemmer'
|
6
6
|
require 'chronic'
|
7
|
+
require 'tempfile'
|
7
8
|
|
8
9
|
$: << File.dirname(__FILE__)
|
9
10
|
require 'pismo/document'
|
@@ -28,8 +29,9 @@ class Nokogiri::HTML::Document
|
|
28
29
|
self.search(search).first rescue nil
|
29
30
|
end
|
30
31
|
|
31
|
-
def match(
|
32
|
-
|
32
|
+
def match(queries = [], all = false)
|
33
|
+
r = [] if all
|
34
|
+
[*queries].each do |query|
|
33
35
|
if query.is_a?(String)
|
34
36
|
result = self.search(query).first.inner_text.strip rescue nil
|
35
37
|
elsif query.is_a?(Array)
|
@@ -41,9 +43,13 @@ class Nokogiri::HTML::Document
|
|
41
43
|
# result.gsub!(/\342\200\224/, '-')
|
42
44
|
result.gsub!('’', '\'')
|
43
45
|
result.gsub!('—', '-')
|
44
|
-
|
46
|
+
if all
|
47
|
+
r << result
|
48
|
+
else
|
49
|
+
return result
|
50
|
+
end
|
45
51
|
end
|
46
52
|
end
|
47
|
-
|
53
|
+
all && !r.empty? ? r : nil
|
48
54
|
end
|
49
55
|
end
|
data/pismo.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{pismo}
|
8
|
-
s.version = "0.2.3"
|
8
|
+
s.version = "0.4.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Peter Cooper"]
|
12
|
-
s.date = %q{2010-05-
|
12
|
+
s.date = %q{2010-05-15}
|
13
13
|
s.default_executable = %q{pismo}
|
14
14
|
s.description = %q{Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, body text, keywords, RSS feed URL, favicon, etc.}
|
15
15
|
s.email = %q{git@peterc.org}
|
data/test/corpus/metadata_expected.yaml
CHANGED
@@ -2,6 +2,9 @@
|
|
2
2
|
:rww:
|
3
3
|
:title: "Cartoon: Apple Tablet: Now With Barometer and Bird Call Generator"
|
4
4
|
:feed: http://www.readwriteweb.com/rss.xml
|
5
|
+
:feeds:
|
6
|
+
- http://www.readwriteweb.com/rss.xml
|
7
|
+
- http://www.readwriteweb.com/archives/2010/01/cartoon_apple_tablet_now_with_barometer_and_bird_c.xml
|
5
8
|
:briancray:
|
6
9
|
:title: 5 great examples of popular blog posts that you should know
|
7
10
|
:feed: http://feeds.feedburner.com/briancray/blog
|
@@ -9,23 +12,37 @@
|
|
9
12
|
:huffington:
|
10
13
|
:title: Afghans Losing Hope After 8 Years Of War
|
11
14
|
:author: TODD PITMAN
|
15
|
+
:authors:
|
16
|
+
- TODD PITMAN
|
17
|
+
- AP
|
12
18
|
:feed: http://feeds.huffingtonpost.com/huffingtonpost/raw_feed
|
13
19
|
:lede: "KABUL - The man on the motorcycle was going the wrong way down a one-way street, gesturing indignantly for the phalanx of traffic-clogged cars in front of him to move."
|
14
20
|
:bbcnews:
|
15
21
|
:title: Gay Muslims made homeless by family violence
|
22
|
+
:titles:
|
23
|
+
- Gay Muslims made homeless by family violence
|
16
24
|
:author: Poonam Taneja
|
25
|
+
:authors:
|
26
|
+
- Poonam Taneja
|
17
27
|
:description: A charity is dealing with more gay Muslims made homeless after fleeing forced marriages and so-called "honour" violence.
|
18
28
|
:lede: A UK charity is dealing with an increasing number of young gay Muslims becoming homeless after fleeing forced marriages and so-called honour violence.
|
19
29
|
:feed: http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/rss.xml
|
20
30
|
:factor:
|
21
31
|
:title: Factor's bootstrap process explained
|
22
32
|
:lede: "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap. "
|
33
|
+
:ledes:
|
34
|
+
- "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap. "
|
35
|
+
- Slava Pestov's weblog, primarily about Factor.
|
23
36
|
:youtube:
|
24
37
|
:title: YMO - Rydeen (Official Video)
|
25
38
|
:author: ymo1965
|
39
|
+
:authors:
|
40
|
+
- ymo1965
|
26
41
|
:spolsky:
|
27
42
|
:title: The Absolute Minimum Every Software Developer Absolutely, Positively Must Know About Unicode and Character Sets (No Excuses!)
|
28
43
|
:description: Haven't mastered the basics of Unicode and character sets? Please don't write another line of code until you've read this article.
|
44
|
+
:ledes:
|
45
|
+
- Ever wonder about that mysterious Content-Type tag? You know, the one you're supposed to put in HTML and you never quite know what it should be?
|
29
46
|
:author: Joel Spolsky
|
30
47
|
:favicon: /favicon.ico
|
31
48
|
:feed: http://www.joelonsoftware.com/rss.xml
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pismo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Cooper
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-05-
|
12
|
+
date: 2010-05-15 00:00:00 +01:00
|
13
13
|
default_executable: pismo
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|