pismo 0.7.2 → 0.7.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +3 -0
- data/LICENSE +1 -1
- data/NOTICE +1 -1
- data/README.markdown +20 -7
- data/Rakefile +0 -23
- data/lib/pismo.rb +6 -3
- data/lib/pismo/document.rb +8 -3
- data/lib/pismo/internal_attributes.rb +38 -6
- data/lib/pismo/reader.rb +10 -394
- data/lib/pismo/reader/base.rb +261 -0
- data/lib/pismo/reader/cluster.rb +171 -0
- data/lib/pismo/reader/tree.rb +154 -0
- data/lib/pismo/version.rb +1 -1
- data/pismo.gemspec +2 -3
- data/test/corpus/metadata_expected.yaml +8 -2
- data/test/corpus/readers/cluster_expected.yaml +45 -0
- data/test/corpus/readers/tree_expected.yaml +55 -0
- data/test/corpus/thegoodbookblog.html +612 -0
- data/test/helper.rb +3 -0
- data/test/test_corpus.rb +16 -3
- metadata +108 -111
- data/test/corpus/metadata_expected.yaml.old +0 -122
data/test/helper.rb
CHANGED
@@ -9,6 +9,9 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
|
9
9
|
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
10
10
|
require 'pismo'
|
11
11
|
|
12
|
+
# Set time zone to prevent parsed times not matching those stored in metadata_expected.yaml
|
13
|
+
ENV['TZ'] = 'UTC'
|
14
|
+
|
12
15
|
class Test::Unit::TestCase
|
13
16
|
include Pismo
|
14
17
|
HTML_DIRECTORY = File.dirname(__FILE__) + "/corpus"
|
data/test/test_corpus.rb
CHANGED
@@ -11,6 +11,8 @@ class TestCorpus < Test::Unit::TestCase
|
|
11
11
|
# Load the "expected metadata" ready for tests
|
12
12
|
@metadata = YAML.load(open(HTML_DIRECTORY + "/metadata_expected.yaml"))
|
13
13
|
@reader_metadata = YAML.load(open(HTML_DIRECTORY + "/reader_expected.yaml"))
|
14
|
+
@readers = {}
|
15
|
+
Dir[HTML_DIRECTORY + "/readers/*_expected.yaml"].each { |filename| @readers[File.basename(filename).sub(/_expected\.yaml$/, '').to_sym] = File.read(filename) }
|
14
16
|
end
|
15
17
|
|
16
18
|
should "pass basic sanitization and result in Nokogiri documents" do
|
@@ -31,11 +33,22 @@ class TestCorpus < Test::Unit::TestCase
|
|
31
33
|
end
|
32
34
|
end
|
33
35
|
|
34
|
-
should "pass content extraction tests" do
|
36
|
+
should "pass base reader content extraction tests" do
|
35
37
|
@reader_metadata.each do |file, expected|
|
36
|
-
@doc = Reader::Document.
|
38
|
+
@doc = Reader::Document.create(@corpus[file])
|
37
39
|
assert_equal expected, @doc.sentences(2)
|
38
40
|
end
|
39
|
-
end
|
41
|
+
end
|
42
|
+
|
43
|
+
should "pass reader content extraction tests" do
|
44
|
+
@readers.each do |reader, expected|
|
45
|
+
results = YAML.load(expected)
|
46
|
+
results.each_key do |file|
|
47
|
+
@doc = Document.new(@corpus[file], :reader => reader)
|
48
|
+
assert_equal results[file][0..1000], @doc.body[0..1000]
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
40
53
|
end
|
41
54
|
end
|
metadata
CHANGED
@@ -1,110 +1,109 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: pismo
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
segments:
|
6
|
-
- 0
|
7
|
-
- 7
|
8
|
-
- 2
|
9
|
-
version: 0.7.2
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.7.4
|
10
5
|
platform: ruby
|
11
|
-
authors:
|
6
|
+
authors:
|
12
7
|
- Peter Cooper
|
13
8
|
autorequire:
|
14
9
|
bindir: bin
|
15
10
|
cert_chain: []
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
dependencies:
|
20
|
-
- !ruby/object:Gem::Dependency
|
11
|
+
date: 2010-12-19 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
21
14
|
name: shoulda
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
22
21
|
prerelease: false
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
- 0
|
30
|
-
version: "0"
|
31
|
-
type: :runtime
|
32
|
-
version_requirements: *id001
|
33
|
-
- !ruby/object:Gem::Dependency
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
34
28
|
name: awesome_print
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
- !ruby/object:Gem::Version
|
41
|
-
segments:
|
42
|
-
- 0
|
43
|
-
version: "0"
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
44
34
|
type: :runtime
|
45
|
-
version_requirements: *id002
|
46
|
-
- !ruby/object:Gem::Dependency
|
47
|
-
name: nokogiri
|
48
35
|
prerelease: false
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: nokogiri
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
57
48
|
type: :runtime
|
58
|
-
version_requirements: *id003
|
59
|
-
- !ruby/object:Gem::Dependency
|
60
|
-
name: sanitize
|
61
49
|
prerelease: false
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: sanitize
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
70
62
|
type: :runtime
|
71
|
-
version_requirements: *id004
|
72
|
-
- !ruby/object:Gem::Dependency
|
73
|
-
name: fast-stemmer
|
74
63
|
prerelease: false
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: fast-stemmer
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
83
76
|
type: :runtime
|
84
|
-
version_requirements: *id005
|
85
|
-
- !ruby/object:Gem::Dependency
|
86
|
-
name: chronic
|
87
77
|
prerelease: false
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: chronic
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - '>='
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
96
90
|
type: :runtime
|
97
|
-
|
98
|
-
|
99
|
-
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
description: Pismo extracts and retrieves content-related metadata from HTML pages
|
98
|
+
- you can use the resulting data in an organized way, such as a summary/first paragraph,
|
99
|
+
body text, keywords, RSS feed URL, favicon, etc.
|
100
|
+
email:
|
100
101
|
- git@peterc.org
|
101
|
-
executables:
|
102
|
+
executables:
|
102
103
|
- pismo
|
103
104
|
extensions: []
|
104
|
-
|
105
105
|
extra_rdoc_files: []
|
106
|
-
|
107
|
-
files:
|
106
|
+
files:
|
108
107
|
- .document
|
109
108
|
- .gitignore
|
110
109
|
- Gemfile
|
@@ -118,6 +117,9 @@ files:
|
|
118
117
|
- lib/pismo/external_attributes.rb
|
119
118
|
- lib/pismo/internal_attributes.rb
|
120
119
|
- lib/pismo/reader.rb
|
120
|
+
- lib/pismo/reader/base.rb
|
121
|
+
- lib/pismo/reader/cluster.rb
|
122
|
+
- lib/pismo/reader/tree.rb
|
121
123
|
- lib/pismo/stopwords.txt
|
122
124
|
- lib/pismo/version.rb
|
123
125
|
- pismo.gemspec
|
@@ -129,52 +131,45 @@ files:
|
|
129
131
|
- test/corpus/gmane.html
|
130
132
|
- test/corpus/huffington.html
|
131
133
|
- test/corpus/metadata_expected.yaml
|
132
|
-
- test/corpus/metadata_expected.yaml.old
|
133
134
|
- test/corpus/queness.html
|
134
135
|
- test/corpus/reader_expected.yaml
|
136
|
+
- test/corpus/readers/cluster_expected.yaml
|
137
|
+
- test/corpus/readers/tree_expected.yaml
|
135
138
|
- test/corpus/rubyinside.html
|
136
139
|
- test/corpus/rww.html
|
137
140
|
- test/corpus/spolsky.html
|
138
141
|
- test/corpus/techcrunch.html
|
142
|
+
- test/corpus/thegoodbookblog.html
|
139
143
|
- test/corpus/tweet.html
|
140
144
|
- test/corpus/youtube.html
|
141
145
|
- test/corpus/zefrank.html
|
142
146
|
- test/helper.rb
|
143
147
|
- test/test_corpus.rb
|
144
148
|
- test/test_pismo_document.rb
|
145
|
-
has_rdoc: true
|
146
149
|
homepage: http://github.com/peterc/pismo
|
147
150
|
licenses: []
|
148
|
-
|
151
|
+
metadata: {}
|
149
152
|
post_install_message:
|
150
153
|
rdoc_options: []
|
151
|
-
|
152
|
-
require_paths:
|
154
|
+
require_paths:
|
153
155
|
- lib
|
154
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
requirements:
|
165
|
-
- - ">="
|
166
|
-
- !ruby/object:Gem::Version
|
167
|
-
segments:
|
168
|
-
- 0
|
169
|
-
version: "0"
|
156
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
157
|
+
requirements:
|
158
|
+
- - '>='
|
159
|
+
- !ruby/object:Gem::Version
|
160
|
+
version: '0'
|
161
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
162
|
+
requirements:
|
163
|
+
- - '>='
|
164
|
+
- !ruby/object:Gem::Version
|
165
|
+
version: '0'
|
170
166
|
requirements: []
|
171
|
-
|
172
167
|
rubyforge_project: pismo
|
173
|
-
rubygems_version:
|
168
|
+
rubygems_version: 2.0.0
|
174
169
|
signing_key:
|
175
|
-
specification_version:
|
170
|
+
specification_version: 4
|
176
171
|
summary: Extracts or retrieves content-related metadata from HTML pages
|
177
|
-
test_files:
|
172
|
+
test_files:
|
178
173
|
- test/corpus/bbcnews.html
|
179
174
|
- test/corpus/bbcnews2.html
|
180
175
|
- test/corpus/briancray.html
|
@@ -183,13 +178,15 @@ test_files:
|
|
183
178
|
- test/corpus/gmane.html
|
184
179
|
- test/corpus/huffington.html
|
185
180
|
- test/corpus/metadata_expected.yaml
|
186
|
-
- test/corpus/metadata_expected.yaml.old
|
187
181
|
- test/corpus/queness.html
|
188
182
|
- test/corpus/reader_expected.yaml
|
183
|
+
- test/corpus/readers/cluster_expected.yaml
|
184
|
+
- test/corpus/readers/tree_expected.yaml
|
189
185
|
- test/corpus/rubyinside.html
|
190
186
|
- test/corpus/rww.html
|
191
187
|
- test/corpus/spolsky.html
|
192
188
|
- test/corpus/techcrunch.html
|
189
|
+
- test/corpus/thegoodbookblog.html
|
193
190
|
- test/corpus/tweet.html
|
194
191
|
- test/corpus/youtube.html
|
195
192
|
- test/corpus/zefrank.html
|
@@ -1,122 +0,0 @@
|
|
1
|
-
---
|
2
|
-
:rww:
|
3
|
-
:title: "Cartoon: Apple Tablet: Now With Barometer and Bird Call Generator"
|
4
|
-
:feed: http://www.readwriteweb.com/rss.xml
|
5
|
-
:briancray:
|
6
|
-
:title: 5 great examples of popular blog posts that you should know
|
7
|
-
:feed: http://feeds.feedburner.com/briancray/blog
|
8
|
-
:lede: "This is a mock post. While there is a place for all of these posts, I'm trying to make a point that original blogs are being shut out by formulaic blogs."
|
9
|
-
:huffington:
|
10
|
-
:title: Afghans Losing Hope After 8 Years Of War
|
11
|
-
:author: TODD PITMAN
|
12
|
-
:feed: http://feeds.huffingtonpost.com/huffingtonpost/raw_feed
|
13
|
-
:lede: "KABUL - The man on the motorcycle was going the wrong way down a one-way street, gesturing indignantly for the phalanx of traffic-clogged cars in front of him to move."
|
14
|
-
:bbcnews:
|
15
|
-
:title: Gay Muslims made homeless by family violence
|
16
|
-
:author: Poonam Taneja
|
17
|
-
:description: A charity is dealing with more gay Muslims made homeless after fleeing forced marriages and so-called "honour" violence.
|
18
|
-
:lede: A UK charity is dealing with an increasing number of young gay Muslims becoming homeless after fleeing forced marriages and so-called honour violence.
|
19
|
-
:feed: http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/rss.xml
|
20
|
-
:factor:
|
21
|
-
:title: Factor's bootstrap process explained
|
22
|
-
:lede: "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap. "
|
23
|
-
:youtube:
|
24
|
-
:title: YMO - Rydeen (Official Video)
|
25
|
-
:author: ymo1965
|
26
|
-
:spolsky:
|
27
|
-
:title: The Absolute Minimum Every Software Developer Absolutely, Positively Must Know About Unicode and Character Sets (No Excuses!)
|
28
|
-
:description: Haven't mastered the basics of Unicode and character sets? Please don't write another line of code until you've read this article.
|
29
|
-
:author: Joel Spolsky
|
30
|
-
:favicon: /favicon.ico
|
31
|
-
:feed: http://www.joelonsoftware.com/rss.xml
|
32
|
-
:techcrunch:
|
33
|
-
:title: Googlle Gets A Sexy New Logo; Remains Sketchy
|
34
|
-
:author: MG Siegler
|
35
|
-
:keywords:
|
36
|
-
- - googlle
|
37
|
-
- 35
|
38
|
-
- - logo
|
39
|
-
- 10
|
40
|
-
- - google
|
41
|
-
- 6
|
42
|
-
- - site
|
43
|
-
- 3
|
44
|
-
- - font
|
45
|
-
- 2
|
46
|
-
- - india
|
47
|
-
- 2
|
48
|
-
- - surprised
|
49
|
-
- 1
|
50
|
-
- - week
|
51
|
-
- 1
|
52
|
-
- - switched
|
53
|
-
- 1
|
54
|
-
- - school
|
55
|
-
- 1
|
56
|
-
- - things
|
57
|
-
- 1
|
58
|
-
- - removing
|
59
|
-
- 1
|
60
|
-
- - steve
|
61
|
-
- 1
|
62
|
-
- - decided
|
63
|
-
- 1
|
64
|
-
- - advantage
|
65
|
-
- 1
|
66
|
-
- - wasn
|
67
|
-
- 1
|
68
|
-
- - accepting
|
69
|
-
- 1
|
70
|
-
- - red
|
71
|
-
- 1
|
72
|
-
- - copy
|
73
|
-
- 1
|
74
|
-
- - wouldn
|
75
|
-
- 1
|
76
|
-
:rubyinside:
|
77
|
-
:title: "CoffeeScript: A New Language With A Pure Ruby Compiler"
|
78
|
-
:author: Peter Cooper
|
79
|
-
:lede: CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it "JavaScript's less ostentatious kid brother" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.
|
80
|
-
:body: "CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it \"JavaScript's less ostentatious kid brother\" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.\nTo get a feel for the language, check out this example code (CoffeeScript on the left, resulting JavaScript on the right):\nAs a Ruby project, you can get the CoffeeScript compiler installed with a simple gem install coffee-script or check out the code from/on GitHub. The code is worth a look as it's notably quite vanilla with hand crafted Ruby covering the lexer and code generation and Racc built code for the parser.\n"
|
81
|
-
:feed: http://www.rubyinside.com/feed/
|
82
|
-
:keywords:
|
83
|
-
- - ruby
|
84
|
-
- 15
|
85
|
-
- - coffeescript
|
86
|
-
- 15
|
87
|
-
- - compiler
|
88
|
-
- 10
|
89
|
-
- - language
|
90
|
-
- 10
|
91
|
-
- - coffee
|
92
|
-
- 5
|
93
|
-
- - pure
|
94
|
-
- 5
|
95
|
-
- - code
|
96
|
-
- 5
|
97
|
-
- - script
|
98
|
-
- 5
|
99
|
-
- - javascript
|
100
|
-
- 3
|
101
|
-
- - github
|
102
|
-
- 2
|
103
|
-
- - syntax
|
104
|
-
- 1
|
105
|
-
- - programming
|
106
|
-
- 1
|
107
|
-
- - brother
|
108
|
-
- 1
|
109
|
-
- - constructs
|
110
|
-
- 1
|
111
|
-
- - vanilla
|
112
|
-
- 1
|
113
|
-
- - parser
|
114
|
-
- 1
|
115
|
-
- - lexer
|
116
|
-
- 1
|
117
|
-
- - project
|
118
|
-
- 1
|
119
|
-
- - installed
|
120
|
-
- 1
|
121
|
-
- - simple
|
122
|
-
- 1
|