pismo 0.7.2 → 0.7.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +3 -0
- data/LICENSE +1 -1
- data/NOTICE +1 -1
- data/README.markdown +20 -7
- data/Rakefile +0 -23
- data/lib/pismo.rb +6 -3
- data/lib/pismo/document.rb +8 -3
- data/lib/pismo/internal_attributes.rb +38 -6
- data/lib/pismo/reader.rb +10 -394
- data/lib/pismo/reader/base.rb +261 -0
- data/lib/pismo/reader/cluster.rb +171 -0
- data/lib/pismo/reader/tree.rb +154 -0
- data/lib/pismo/version.rb +1 -1
- data/pismo.gemspec +2 -3
- data/test/corpus/metadata_expected.yaml +8 -2
- data/test/corpus/readers/cluster_expected.yaml +45 -0
- data/test/corpus/readers/tree_expected.yaml +55 -0
- data/test/corpus/thegoodbookblog.html +612 -0
- data/test/helper.rb +3 -0
- data/test/test_corpus.rb +16 -3
- metadata +108 -111
- data/test/corpus/metadata_expected.yaml.old +0 -122
data/test/helper.rb
CHANGED
@@ -9,6 +9,9 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
|
9
9
|
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
10
10
|
require 'pismo'
|
11
11
|
|
12
|
+
# Set time zone to prevent parsed times not matching those stored in metadata_expected.yaml
|
13
|
+
ENV['TZ'] = 'UTC'
|
14
|
+
|
12
15
|
class Test::Unit::TestCase
|
13
16
|
include Pismo
|
14
17
|
HTML_DIRECTORY = File.dirname(__FILE__) + "/corpus"
|
data/test/test_corpus.rb
CHANGED
@@ -11,6 +11,8 @@ class TestCorpus < Test::Unit::TestCase
|
|
11
11
|
# Load the "expected metadata" ready for tests
|
12
12
|
@metadata = YAML.load(open(HTML_DIRECTORY + "/metadata_expected.yaml"))
|
13
13
|
@reader_metadata = YAML.load(open(HTML_DIRECTORY + "/reader_expected.yaml"))
|
14
|
+
@readers = {}
|
15
|
+
Dir[HTML_DIRECTORY + "/readers/*_expected.yaml"].each { |filename| @readers[File.basename(filename).sub(/_expected\.yaml$/, '').to_sym] = File.read(filename) }
|
14
16
|
end
|
15
17
|
|
16
18
|
should "pass basic sanitization and result in Nokogiri documents" do
|
@@ -31,11 +33,22 @@ class TestCorpus < Test::Unit::TestCase
|
|
31
33
|
end
|
32
34
|
end
|
33
35
|
|
34
|
-
should "pass content extraction tests" do
|
36
|
+
should "pass base reader content extraction tests" do
|
35
37
|
@reader_metadata.each do |file, expected|
|
36
|
-
@doc = Reader::Document.
|
38
|
+
@doc = Reader::Document.create(@corpus[file])
|
37
39
|
assert_equal expected, @doc.sentences(2)
|
38
40
|
end
|
39
|
-
end
|
41
|
+
end
|
42
|
+
|
43
|
+
should "pass reader content extraction tests" do
|
44
|
+
@readers.each do |reader, expected|
|
45
|
+
results = YAML.load(expected)
|
46
|
+
results.each_key do |file|
|
47
|
+
@doc = Document.new(@corpus[file], :reader => reader)
|
48
|
+
assert_equal results[file][0..1000], @doc.body[0..1000]
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
40
53
|
end
|
41
54
|
end
|
metadata
CHANGED
@@ -1,110 +1,109 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: pismo
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
segments:
|
6
|
-
- 0
|
7
|
-
- 7
|
8
|
-
- 2
|
9
|
-
version: 0.7.2
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.7.4
|
10
5
|
platform: ruby
|
11
|
-
authors:
|
6
|
+
authors:
|
12
7
|
- Peter Cooper
|
13
8
|
autorequire:
|
14
9
|
bindir: bin
|
15
10
|
cert_chain: []
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
dependencies:
|
20
|
-
- !ruby/object:Gem::Dependency
|
11
|
+
date: 2010-12-19 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
21
14
|
name: shoulda
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
22
21
|
prerelease: false
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
- 0
|
30
|
-
version: "0"
|
31
|
-
type: :runtime
|
32
|
-
version_requirements: *id001
|
33
|
-
- !ruby/object:Gem::Dependency
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
34
28
|
name: awesome_print
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
- !ruby/object:Gem::Version
|
41
|
-
segments:
|
42
|
-
- 0
|
43
|
-
version: "0"
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
44
34
|
type: :runtime
|
45
|
-
version_requirements: *id002
|
46
|
-
- !ruby/object:Gem::Dependency
|
47
|
-
name: nokogiri
|
48
35
|
prerelease: false
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: nokogiri
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
57
48
|
type: :runtime
|
58
|
-
version_requirements: *id003
|
59
|
-
- !ruby/object:Gem::Dependency
|
60
|
-
name: sanitize
|
61
49
|
prerelease: false
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: sanitize
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
70
62
|
type: :runtime
|
71
|
-
version_requirements: *id004
|
72
|
-
- !ruby/object:Gem::Dependency
|
73
|
-
name: fast-stemmer
|
74
63
|
prerelease: false
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: fast-stemmer
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
83
76
|
type: :runtime
|
84
|
-
version_requirements: *id005
|
85
|
-
- !ruby/object:Gem::Dependency
|
86
|
-
name: chronic
|
87
77
|
prerelease: false
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: chronic
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - '>='
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
96
90
|
type: :runtime
|
97
|
-
|
98
|
-
|
99
|
-
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
description: Pismo extracts and retrieves content-related metadata from HTML pages
|
98
|
+
- you can use the resulting data in an organized way, such as a summary/first paragraph,
|
99
|
+
body text, keywords, RSS feed URL, favicon, etc.
|
100
|
+
email:
|
100
101
|
- git@peterc.org
|
101
|
-
executables:
|
102
|
+
executables:
|
102
103
|
- pismo
|
103
104
|
extensions: []
|
104
|
-
|
105
105
|
extra_rdoc_files: []
|
106
|
-
|
107
|
-
files:
|
106
|
+
files:
|
108
107
|
- .document
|
109
108
|
- .gitignore
|
110
109
|
- Gemfile
|
@@ -118,6 +117,9 @@ files:
|
|
118
117
|
- lib/pismo/external_attributes.rb
|
119
118
|
- lib/pismo/internal_attributes.rb
|
120
119
|
- lib/pismo/reader.rb
|
120
|
+
- lib/pismo/reader/base.rb
|
121
|
+
- lib/pismo/reader/cluster.rb
|
122
|
+
- lib/pismo/reader/tree.rb
|
121
123
|
- lib/pismo/stopwords.txt
|
122
124
|
- lib/pismo/version.rb
|
123
125
|
- pismo.gemspec
|
@@ -129,52 +131,45 @@ files:
|
|
129
131
|
- test/corpus/gmane.html
|
130
132
|
- test/corpus/huffington.html
|
131
133
|
- test/corpus/metadata_expected.yaml
|
132
|
-
- test/corpus/metadata_expected.yaml.old
|
133
134
|
- test/corpus/queness.html
|
134
135
|
- test/corpus/reader_expected.yaml
|
136
|
+
- test/corpus/readers/cluster_expected.yaml
|
137
|
+
- test/corpus/readers/tree_expected.yaml
|
135
138
|
- test/corpus/rubyinside.html
|
136
139
|
- test/corpus/rww.html
|
137
140
|
- test/corpus/spolsky.html
|
138
141
|
- test/corpus/techcrunch.html
|
142
|
+
- test/corpus/thegoodbookblog.html
|
139
143
|
- test/corpus/tweet.html
|
140
144
|
- test/corpus/youtube.html
|
141
145
|
- test/corpus/zefrank.html
|
142
146
|
- test/helper.rb
|
143
147
|
- test/test_corpus.rb
|
144
148
|
- test/test_pismo_document.rb
|
145
|
-
has_rdoc: true
|
146
149
|
homepage: http://github.com/peterc/pismo
|
147
150
|
licenses: []
|
148
|
-
|
151
|
+
metadata: {}
|
149
152
|
post_install_message:
|
150
153
|
rdoc_options: []
|
151
|
-
|
152
|
-
require_paths:
|
154
|
+
require_paths:
|
153
155
|
- lib
|
154
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
requirements:
|
165
|
-
- - ">="
|
166
|
-
- !ruby/object:Gem::Version
|
167
|
-
segments:
|
168
|
-
- 0
|
169
|
-
version: "0"
|
156
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
157
|
+
requirements:
|
158
|
+
- - '>='
|
159
|
+
- !ruby/object:Gem::Version
|
160
|
+
version: '0'
|
161
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
162
|
+
requirements:
|
163
|
+
- - '>='
|
164
|
+
- !ruby/object:Gem::Version
|
165
|
+
version: '0'
|
170
166
|
requirements: []
|
171
|
-
|
172
167
|
rubyforge_project: pismo
|
173
|
-
rubygems_version:
|
168
|
+
rubygems_version: 2.0.0
|
174
169
|
signing_key:
|
175
|
-
specification_version:
|
170
|
+
specification_version: 4
|
176
171
|
summary: Extracts or retrieves content-related metadata from HTML pages
|
177
|
-
test_files:
|
172
|
+
test_files:
|
178
173
|
- test/corpus/bbcnews.html
|
179
174
|
- test/corpus/bbcnews2.html
|
180
175
|
- test/corpus/briancray.html
|
@@ -183,13 +178,15 @@ test_files:
|
|
183
178
|
- test/corpus/gmane.html
|
184
179
|
- test/corpus/huffington.html
|
185
180
|
- test/corpus/metadata_expected.yaml
|
186
|
-
- test/corpus/metadata_expected.yaml.old
|
187
181
|
- test/corpus/queness.html
|
188
182
|
- test/corpus/reader_expected.yaml
|
183
|
+
- test/corpus/readers/cluster_expected.yaml
|
184
|
+
- test/corpus/readers/tree_expected.yaml
|
189
185
|
- test/corpus/rubyinside.html
|
190
186
|
- test/corpus/rww.html
|
191
187
|
- test/corpus/spolsky.html
|
192
188
|
- test/corpus/techcrunch.html
|
189
|
+
- test/corpus/thegoodbookblog.html
|
193
190
|
- test/corpus/tweet.html
|
194
191
|
- test/corpus/youtube.html
|
195
192
|
- test/corpus/zefrank.html
|
@@ -1,122 +0,0 @@
|
|
1
|
-
---
|
2
|
-
:rww:
|
3
|
-
:title: "Cartoon: Apple Tablet: Now With Barometer and Bird Call Generator"
|
4
|
-
:feed: http://www.readwriteweb.com/rss.xml
|
5
|
-
:briancray:
|
6
|
-
:title: 5 great examples of popular blog posts that you should know
|
7
|
-
:feed: http://feeds.feedburner.com/briancray/blog
|
8
|
-
:lede: "This is a mock post. While there is a place for all of these posts, I'm trying to make a point that original blogs are being shut out by formulaic blogs."
|
9
|
-
:huffington:
|
10
|
-
:title: Afghans Losing Hope After 8 Years Of War
|
11
|
-
:author: TODD PITMAN
|
12
|
-
:feed: http://feeds.huffingtonpost.com/huffingtonpost/raw_feed
|
13
|
-
:lede: "KABUL - The man on the motorcycle was going the wrong way down a one-way street, gesturing indignantly for the phalanx of traffic-clogged cars in front of him to move."
|
14
|
-
:bbcnews:
|
15
|
-
:title: Gay Muslims made homeless by family violence
|
16
|
-
:author: Poonam Taneja
|
17
|
-
:description: A charity is dealing with more gay Muslims made homeless after fleeing forced marriages and so-called "honour" violence.
|
18
|
-
:lede: A UK charity is dealing with an increasing number of young gay Muslims becoming homeless after fleeing forced marriages and so-called honour violence.
|
19
|
-
:feed: http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/rss.xml
|
20
|
-
:factor:
|
21
|
-
:title: Factor's bootstrap process explained
|
22
|
-
:lede: "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap. "
|
23
|
-
:youtube:
|
24
|
-
:title: YMO - Rydeen (Official Video)
|
25
|
-
:author: ymo1965
|
26
|
-
:spolsky:
|
27
|
-
:title: The Absolute Minimum Every Software Developer Absolutely, Positively Must Know About Unicode and Character Sets (No Excuses!)
|
28
|
-
:description: Haven't mastered the basics of Unicode and character sets? Please don't write another line of code until you've read this article.
|
29
|
-
:author: Joel Spolsky
|
30
|
-
:favicon: /favicon.ico
|
31
|
-
:feed: http://www.joelonsoftware.com/rss.xml
|
32
|
-
:techcrunch:
|
33
|
-
:title: Googlle Gets A Sexy New Logo; Remains Sketchy
|
34
|
-
:author: MG Siegler
|
35
|
-
:keywords:
|
36
|
-
- - googlle
|
37
|
-
- 35
|
38
|
-
- - logo
|
39
|
-
- 10
|
40
|
-
- - google
|
41
|
-
- 6
|
42
|
-
- - site
|
43
|
-
- 3
|
44
|
-
- - font
|
45
|
-
- 2
|
46
|
-
- - india
|
47
|
-
- 2
|
48
|
-
- - surprised
|
49
|
-
- 1
|
50
|
-
- - week
|
51
|
-
- 1
|
52
|
-
- - switched
|
53
|
-
- 1
|
54
|
-
- - school
|
55
|
-
- 1
|
56
|
-
- - things
|
57
|
-
- 1
|
58
|
-
- - removing
|
59
|
-
- 1
|
60
|
-
- - steve
|
61
|
-
- 1
|
62
|
-
- - decided
|
63
|
-
- 1
|
64
|
-
- - advantage
|
65
|
-
- 1
|
66
|
-
- - wasn
|
67
|
-
- 1
|
68
|
-
- - accepting
|
69
|
-
- 1
|
70
|
-
- - red
|
71
|
-
- 1
|
72
|
-
- - copy
|
73
|
-
- 1
|
74
|
-
- - wouldn
|
75
|
-
- 1
|
76
|
-
:rubyinside:
|
77
|
-
:title: "CoffeeScript: A New Language With A Pure Ruby Compiler"
|
78
|
-
:author: Peter Cooper
|
79
|
-
:lede: CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it "JavaScript's less ostentatious kid brother" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.
|
80
|
-
:body: "CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it \"JavaScript's less ostentatious kid brother\" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.\nTo get a feel for the language, check out this example code (CoffeeScript on the left, resulting JavaScript on the right):\nAs a Ruby project, you can get the CoffeeScript compiler installed with a simple gem install coffee-script or check out the code from/on GitHub. The code is worth a look as it's notably quite vanilla with hand crafted Ruby covering the lexer and code generation and Racc built code for the parser.\n"
|
81
|
-
:feed: http://www.rubyinside.com/feed/
|
82
|
-
:keywords:
|
83
|
-
- - ruby
|
84
|
-
- 15
|
85
|
-
- - coffeescript
|
86
|
-
- 15
|
87
|
-
- - compiler
|
88
|
-
- 10
|
89
|
-
- - language
|
90
|
-
- 10
|
91
|
-
- - coffee
|
92
|
-
- 5
|
93
|
-
- - pure
|
94
|
-
- 5
|
95
|
-
- - code
|
96
|
-
- 5
|
97
|
-
- - script
|
98
|
-
- 5
|
99
|
-
- - javascript
|
100
|
-
- 3
|
101
|
-
- - github
|
102
|
-
- 2
|
103
|
-
- - syntax
|
104
|
-
- 1
|
105
|
-
- - programming
|
106
|
-
- 1
|
107
|
-
- - brother
|
108
|
-
- 1
|
109
|
-
- - constructs
|
110
|
-
- 1
|
111
|
-
- - vanilla
|
112
|
-
- 1
|
113
|
-
- - parser
|
114
|
-
- 1
|
115
|
-
- - lexer
|
116
|
-
- 1
|
117
|
-
- - project
|
118
|
-
- 1
|
119
|
-
- - installed
|
120
|
-
- 1
|
121
|
-
- - simple
|
122
|
-
- 1
|