pismo 0.7.2 → 0.7.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,6 +9,9 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
9
9
  $LOAD_PATH.unshift(File.dirname(__FILE__))
10
10
  require 'pismo'
11
11
 
12
+ # Set time zone to prevent parsed times not matching those stored in metadata_expected.yaml
13
+ ENV['TZ'] = 'UTC'
14
+
12
15
  class Test::Unit::TestCase
13
16
  include Pismo
14
17
  HTML_DIRECTORY = File.dirname(__FILE__) + "/corpus"
@@ -11,6 +11,8 @@ class TestCorpus < Test::Unit::TestCase
11
11
  # Load the "expected metadata" ready for tests
12
12
  @metadata = YAML.load(open(HTML_DIRECTORY + "/metadata_expected.yaml"))
13
13
  @reader_metadata = YAML.load(open(HTML_DIRECTORY + "/reader_expected.yaml"))
14
+ @readers = {}
15
+ Dir[HTML_DIRECTORY + "/readers/*_expected.yaml"].each { |filename| @readers[File.basename(filename).sub(/_expected\.yaml$/, '').to_sym] = File.read(filename) }
14
16
  end
15
17
 
16
18
  should "pass basic sanitization and result in Nokogiri documents" do
@@ -31,11 +33,22 @@ class TestCorpus < Test::Unit::TestCase
31
33
  end
32
34
  end
33
35
 
34
- should "pass content extraction tests" do
36
+ should "pass base reader content extraction tests" do
35
37
  @reader_metadata.each do |file, expected|
36
- @doc = Reader::Document.new(@corpus[file])
38
+ @doc = Reader::Document.create(@corpus[file])
37
39
  assert_equal expected, @doc.sentences(2)
38
40
  end
39
- end
41
+ end
42
+
43
+ should "pass reader content extraction tests" do
44
+ @readers.each do |reader, expected|
45
+ results = YAML.load(expected)
46
+ results.each_key do |file|
47
+ @doc = Document.new(@corpus[file], :reader => reader)
48
+ assert_equal results[file][0..1000], @doc.body[0..1000]
49
+ end
50
+ end
51
+ end
52
+
40
53
  end
41
54
  end
metadata CHANGED
@@ -1,110 +1,109 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: pismo
3
- version: !ruby/object:Gem::Version
4
- prerelease: false
5
- segments:
6
- - 0
7
- - 7
8
- - 2
9
- version: 0.7.2
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.7.4
10
5
  platform: ruby
11
- authors:
6
+ authors:
12
7
  - Peter Cooper
13
8
  autorequire:
14
9
  bindir: bin
15
10
  cert_chain: []
16
-
17
- date: 2010-07-27 00:00:00 +01:00
18
- default_executable: pismo
19
- dependencies:
20
- - !ruby/object:Gem::Dependency
11
+ date: 2010-12-19 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
21
14
  name: shoulda
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
22
21
  prerelease: false
23
- requirement: &id001 !ruby/object:Gem::Requirement
24
- none: false
25
- requirements:
26
- - - ">="
27
- - !ruby/object:Gem::Version
28
- segments:
29
- - 0
30
- version: "0"
31
- type: :runtime
32
- version_requirements: *id001
33
- - !ruby/object:Gem::Dependency
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
34
28
  name: awesome_print
35
- prerelease: false
36
- requirement: &id002 !ruby/object:Gem::Requirement
37
- none: false
38
- requirements:
39
- - - ">="
40
- - !ruby/object:Gem::Version
41
- segments:
42
- - 0
43
- version: "0"
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
44
34
  type: :runtime
45
- version_requirements: *id002
46
- - !ruby/object:Gem::Dependency
47
- name: nokogiri
48
35
  prerelease: false
49
- requirement: &id003 !ruby/object:Gem::Requirement
50
- none: false
51
- requirements:
52
- - - ">="
53
- - !ruby/object:Gem::Version
54
- segments:
55
- - 0
56
- version: "0"
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
57
48
  type: :runtime
58
- version_requirements: *id003
59
- - !ruby/object:Gem::Dependency
60
- name: sanitize
61
49
  prerelease: false
62
- requirement: &id004 !ruby/object:Gem::Requirement
63
- none: false
64
- requirements:
65
- - - ">="
66
- - !ruby/object:Gem::Version
67
- segments:
68
- - 0
69
- version: "0"
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: sanitize
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
70
62
  type: :runtime
71
- version_requirements: *id004
72
- - !ruby/object:Gem::Dependency
73
- name: fast-stemmer
74
63
  prerelease: false
75
- requirement: &id005 !ruby/object:Gem::Requirement
76
- none: false
77
- requirements:
78
- - - ">="
79
- - !ruby/object:Gem::Version
80
- segments:
81
- - 0
82
- version: "0"
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: fast-stemmer
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
83
76
  type: :runtime
84
- version_requirements: *id005
85
- - !ruby/object:Gem::Dependency
86
- name: chronic
87
77
  prerelease: false
88
- requirement: &id006 !ruby/object:Gem::Requirement
89
- none: false
90
- requirements:
91
- - - ">="
92
- - !ruby/object:Gem::Version
93
- segments:
94
- - 0
95
- version: "0"
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: chronic
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - '>='
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
96
90
  type: :runtime
97
- version_requirements: *id006
98
- description: Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, body text, keywords, RSS feed URL, favicon, etc.
99
- email:
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - '>='
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description: Pismo extracts and retrieves content-related metadata from HTML pages
98
+ - you can use the resulting data in an organized way, such as a summary/first paragraph,
99
+ body text, keywords, RSS feed URL, favicon, etc.
100
+ email:
100
101
  - git@peterc.org
101
- executables:
102
+ executables:
102
103
  - pismo
103
104
  extensions: []
104
-
105
105
  extra_rdoc_files: []
106
-
107
- files:
106
+ files:
108
107
  - .document
109
108
  - .gitignore
110
109
  - Gemfile
@@ -118,6 +117,9 @@ files:
118
117
  - lib/pismo/external_attributes.rb
119
118
  - lib/pismo/internal_attributes.rb
120
119
  - lib/pismo/reader.rb
120
+ - lib/pismo/reader/base.rb
121
+ - lib/pismo/reader/cluster.rb
122
+ - lib/pismo/reader/tree.rb
121
123
  - lib/pismo/stopwords.txt
122
124
  - lib/pismo/version.rb
123
125
  - pismo.gemspec
@@ -129,52 +131,45 @@ files:
129
131
  - test/corpus/gmane.html
130
132
  - test/corpus/huffington.html
131
133
  - test/corpus/metadata_expected.yaml
132
- - test/corpus/metadata_expected.yaml.old
133
134
  - test/corpus/queness.html
134
135
  - test/corpus/reader_expected.yaml
136
+ - test/corpus/readers/cluster_expected.yaml
137
+ - test/corpus/readers/tree_expected.yaml
135
138
  - test/corpus/rubyinside.html
136
139
  - test/corpus/rww.html
137
140
  - test/corpus/spolsky.html
138
141
  - test/corpus/techcrunch.html
142
+ - test/corpus/thegoodbookblog.html
139
143
  - test/corpus/tweet.html
140
144
  - test/corpus/youtube.html
141
145
  - test/corpus/zefrank.html
142
146
  - test/helper.rb
143
147
  - test/test_corpus.rb
144
148
  - test/test_pismo_document.rb
145
- has_rdoc: true
146
149
  homepage: http://github.com/peterc/pismo
147
150
  licenses: []
148
-
151
+ metadata: {}
149
152
  post_install_message:
150
153
  rdoc_options: []
151
-
152
- require_paths:
154
+ require_paths:
153
155
  - lib
154
- required_ruby_version: !ruby/object:Gem::Requirement
155
- none: false
156
- requirements:
157
- - - ">="
158
- - !ruby/object:Gem::Version
159
- segments:
160
- - 0
161
- version: "0"
162
- required_rubygems_version: !ruby/object:Gem::Requirement
163
- none: false
164
- requirements:
165
- - - ">="
166
- - !ruby/object:Gem::Version
167
- segments:
168
- - 0
169
- version: "0"
156
+ required_ruby_version: !ruby/object:Gem::Requirement
157
+ requirements:
158
+ - - '>='
159
+ - !ruby/object:Gem::Version
160
+ version: '0'
161
+ required_rubygems_version: !ruby/object:Gem::Requirement
162
+ requirements:
163
+ - - '>='
164
+ - !ruby/object:Gem::Version
165
+ version: '0'
170
166
  requirements: []
171
-
172
167
  rubyforge_project: pismo
173
- rubygems_version: 1.3.7
168
+ rubygems_version: 2.0.0
174
169
  signing_key:
175
- specification_version: 3
170
+ specification_version: 4
176
171
  summary: Extracts or retrieves content-related metadata from HTML pages
177
- test_files:
172
+ test_files:
178
173
  - test/corpus/bbcnews.html
179
174
  - test/corpus/bbcnews2.html
180
175
  - test/corpus/briancray.html
@@ -183,13 +178,15 @@ test_files:
183
178
  - test/corpus/gmane.html
184
179
  - test/corpus/huffington.html
185
180
  - test/corpus/metadata_expected.yaml
186
- - test/corpus/metadata_expected.yaml.old
187
181
  - test/corpus/queness.html
188
182
  - test/corpus/reader_expected.yaml
183
+ - test/corpus/readers/cluster_expected.yaml
184
+ - test/corpus/readers/tree_expected.yaml
189
185
  - test/corpus/rubyinside.html
190
186
  - test/corpus/rww.html
191
187
  - test/corpus/spolsky.html
192
188
  - test/corpus/techcrunch.html
189
+ - test/corpus/thegoodbookblog.html
193
190
  - test/corpus/tweet.html
194
191
  - test/corpus/youtube.html
195
192
  - test/corpus/zefrank.html
@@ -1,122 +0,0 @@
1
- ---
2
- :rww:
3
- :title: "Cartoon: Apple Tablet: Now With Barometer and Bird Call Generator"
4
- :feed: http://www.readwriteweb.com/rss.xml
5
- :briancray:
6
- :title: 5 great examples of popular blog posts that you should know
7
- :feed: http://feeds.feedburner.com/briancray/blog
8
- :lede: "This is a mock post. While there is a place for all of these posts, I'm trying to make a point that original blogs are being shut out by formulaic blogs."
9
- :huffington:
10
- :title: Afghans Losing Hope After 8 Years Of War
11
- :author: TODD PITMAN
12
- :feed: http://feeds.huffingtonpost.com/huffingtonpost/raw_feed
13
- :lede: "KABUL - The man on the motorcycle was going the wrong way down a one-way street, gesturing indignantly for the phalanx of traffic-clogged cars in front of him to move."
14
- :bbcnews:
15
- :title: Gay Muslims made homeless by family violence
16
- :author: Poonam Taneja
17
- :description: A charity is dealing with more gay Muslims made homeless after fleeing forced marriages and so-called "honour" violence.
18
- :lede: A UK charity is dealing with an increasing number of young gay Muslims becoming homeless after fleeing forced marriages and so-called honour violence.
19
- :feed: http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/rss.xml
20
- :factor:
21
- :title: Factor's bootstrap process explained
22
- :lede: "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap. "
23
- :youtube:
24
- :title: YMO - Rydeen (Official Video)
25
- :author: ymo1965
26
- :spolsky:
27
- :title: The Absolute Minimum Every Software Developer Absolutely, Positively Must Know About Unicode and Character Sets (No Excuses!)
28
- :description: Haven't mastered the basics of Unicode and character sets? Please don't write another line of code until you've read this article.
29
- :author: Joel Spolsky
30
- :favicon: /favicon.ico
31
- :feed: http://www.joelonsoftware.com/rss.xml
32
- :techcrunch:
33
- :title: Googlle Gets A Sexy New Logo; Remains Sketchy
34
- :author: MG Siegler
35
- :keywords:
36
- - - googlle
37
- - 35
38
- - - logo
39
- - 10
40
- - - google
41
- - 6
42
- - - site
43
- - 3
44
- - - font
45
- - 2
46
- - - india
47
- - 2
48
- - - surprised
49
- - 1
50
- - - week
51
- - 1
52
- - - switched
53
- - 1
54
- - - school
55
- - 1
56
- - - things
57
- - 1
58
- - - removing
59
- - 1
60
- - - steve
61
- - 1
62
- - - decided
63
- - 1
64
- - - advantage
65
- - 1
66
- - - wasn
67
- - 1
68
- - - accepting
69
- - 1
70
- - - red
71
- - 1
72
- - - copy
73
- - 1
74
- - - wouldn
75
- - 1
76
- :rubyinside:
77
- :title: "CoffeeScript: A New Language With A Pure Ruby Compiler"
78
- :author: Peter Cooper
79
- :lede: CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it "JavaScript's less ostentatious kid brother" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.
80
- :body: "CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it \"JavaScript's less ostentatious kid brother\" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.\nTo get a feel for the language, check out this example code (CoffeeScript on the left, resulting JavaScript on the right):\nAs a Ruby project, you can get the CoffeeScript compiler installed with a simple gem install coffee-script or check out the code from/on GitHub. The code is worth a look as it's notably quite vanilla with hand crafted Ruby covering the lexer and code generation and Racc built code for the parser.\n"
81
- :feed: http://www.rubyinside.com/feed/
82
- :keywords:
83
- - - ruby
84
- - 15
85
- - - coffeescript
86
- - 15
87
- - - compiler
88
- - 10
89
- - - language
90
- - 10
91
- - - coffee
92
- - 5
93
- - - pure
94
- - 5
95
- - - code
96
- - 5
97
- - - script
98
- - 5
99
- - - javascript
100
- - 3
101
- - - github
102
- - 2
103
- - - syntax
104
- - 1
105
- - - programming
106
- - 1
107
- - - brother
108
- - 1
109
- - - constructs
110
- - 1
111
- - - vanilla
112
- - 1
113
- - - parser
114
- - 1
115
- - - lexer
116
- - 1
117
- - - project
118
- - 1
119
- - - installed
120
- - 1
121
- - - simple
122
- - 1