pismo 0.2.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +25 -20
- data/VERSION +1 -1
- data/lib/pismo/document.rb +3 -3
- data/lib/pismo/internal_attributes.rb +127 -47
- data/lib/pismo/readability.rb +6 -1
- data/lib/pismo/stopwords.txt +452 -326
- data/lib/pismo.rb +10 -4
- data/pismo.gemspec +2 -2
- data/test/corpus/metadata_expected.yaml +17 -0
- metadata +2 -2
data/lib/pismo.rb
CHANGED
@@ -4,6 +4,7 @@ require 'open-uri'
|
|
4
4
|
require 'nokogiri'
|
5
5
|
require 'fast_stemmer'
|
6
6
|
require 'chronic'
|
7
|
+
require 'tempfile'
|
7
8
|
|
8
9
|
$: << File.dirname(__FILE__)
|
9
10
|
require 'pismo/document'
|
@@ -28,8 +29,9 @@ class Nokogiri::HTML::Document
|
|
28
29
|
self.search(search).first rescue nil
|
29
30
|
end
|
30
31
|
|
31
|
-
def match(
|
32
|
-
|
32
|
+
def match(queries = [], all = false)
|
33
|
+
r = [] if all
|
34
|
+
[*queries].each do |query|
|
33
35
|
if query.is_a?(String)
|
34
36
|
result = self.search(query).first.inner_text.strip rescue nil
|
35
37
|
elsif query.is_a?(Array)
|
@@ -41,9 +43,13 @@ class Nokogiri::HTML::Document
|
|
41
43
|
# result.gsub!(/\342\200\224/, '-')
|
42
44
|
result.gsub!('’', '\'')
|
43
45
|
result.gsub!('—', '-')
|
44
|
-
|
46
|
+
if all
|
47
|
+
r << result
|
48
|
+
else
|
49
|
+
return result
|
50
|
+
end
|
45
51
|
end
|
46
52
|
end
|
47
|
-
|
53
|
+
all && !r.empty? ? r : nil
|
48
54
|
end
|
49
55
|
end
|
data/pismo.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{pismo}
|
8
|
-
s.version = "0.
|
8
|
+
s.version = "0.4.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Peter Cooper"]
|
12
|
-
s.date = %q{2010-05-
|
12
|
+
s.date = %q{2010-05-15}
|
13
13
|
s.default_executable = %q{pismo}
|
14
14
|
s.description = %q{Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, body text, keywords, RSS feed URL, favicon, etc.}
|
15
15
|
s.email = %q{git@peterc.org}
|
data/test/corpus/metadata_expected.yaml
CHANGED
@@ -2,6 +2,9 @@
|
|
2
2
|
:rww:
|
3
3
|
:title: "Cartoon: Apple Tablet: Now With Barometer and Bird Call Generator"
|
4
4
|
:feed: http://www.readwriteweb.com/rss.xml
|
5
|
+
:feeds:
|
6
|
+
- http://www.readwriteweb.com/rss.xml
|
7
|
+
- http://www.readwriteweb.com/archives/2010/01/cartoon_apple_tablet_now_with_barometer_and_bird_c.xml
|
5
8
|
:briancray:
|
6
9
|
:title: 5 great examples of popular blog posts that you should know
|
7
10
|
:feed: http://feeds.feedburner.com/briancray/blog
|
@@ -9,23 +12,37 @@
|
|
9
12
|
:huffington:
|
10
13
|
:title: Afghans Losing Hope After 8 Years Of War
|
11
14
|
:author: TODD PITMAN
|
15
|
+
:authors:
|
16
|
+
- TODD PITMAN
|
17
|
+
- AP
|
12
18
|
:feed: http://feeds.huffingtonpost.com/huffingtonpost/raw_feed
|
13
19
|
:lede: "KABUL - The man on the motorcycle was going the wrong way down a one-way street, gesturing indignantly for the phalanx of traffic-clogged cars in front of him to move."
|
14
20
|
:bbcnews:
|
15
21
|
:title: Gay Muslims made homeless by family violence
|
22
|
+
:titles:
|
23
|
+
- Gay Muslims made homeless by family violence
|
16
24
|
:author: Poonam Taneja
|
25
|
+
:authors:
|
26
|
+
- Poonam Taneja
|
17
27
|
:description: A charity is dealing with more gay Muslims made homeless after fleeing forced marriages and so-called "honour" violence.
|
18
28
|
:lede: A UK charity is dealing with an increasing number of young gay Muslims becoming homeless after fleeing forced marriages and so-called honour violence.
|
19
29
|
:feed: http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/rss.xml
|
20
30
|
:factor:
|
21
31
|
:title: Factor's bootstrap process explained
|
22
32
|
:lede: "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap. "
|
33
|
+
:ledes:
|
34
|
+
- "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap. "
|
35
|
+
- Slava Pestov's weblog, primarily about Factor.
|
23
36
|
:youtube:
|
24
37
|
:title: YMO - Rydeen (Official Video)
|
25
38
|
:author: ymo1965
|
39
|
+
:authors:
|
40
|
+
- ymo1965
|
26
41
|
:spolsky:
|
27
42
|
:title: The Absolute Minimum Every Software Developer Absolutely, Positively Must Know About Unicode and Character Sets (No Excuses!)
|
28
43
|
:description: Haven't mastered the basics of Unicode and character sets? Please don't write another line of code until you've read this article.
|
44
|
+
:ledes:
|
45
|
+
- Ever wonder about that mysterious Content-Type tag? You know, the one you're supposed to put in HTML and you never quite know what it should be?
|
29
46
|
:author: Joel Spolsky
|
30
47
|
:favicon: /favicon.ico
|
31
48
|
:feed: http://www.joelonsoftware.com/rss.xml
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pismo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Cooper
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-05-
|
12
|
+
date: 2010-05-15 00:00:00 +01:00
|
13
13
|
default_executable: pismo
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|