pismo 0.7.2 → 0.7.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -9,6 +9,9 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
9
9
  $LOAD_PATH.unshift(File.dirname(__FILE__))
10
10
  require 'pismo'
11
11
 
12
+ # Set time zone to prevent parsed times not matching those stored in metadata_expected.yaml
13
+ ENV['TZ'] = 'UTC'
14
+
12
15
  class Test::Unit::TestCase
13
16
  include Pismo
14
17
  HTML_DIRECTORY = File.dirname(__FILE__) + "/corpus"
@@ -11,6 +11,8 @@ class TestCorpus < Test::Unit::TestCase
11
11
  # Load the "expected metadata" ready for tests
12
12
  @metadata = YAML.load(open(HTML_DIRECTORY + "/metadata_expected.yaml"))
13
13
  @reader_metadata = YAML.load(open(HTML_DIRECTORY + "/reader_expected.yaml"))
14
+ @readers = {}
15
+ Dir[HTML_DIRECTORY + "/readers/*_expected.yaml"].each { |filename| @readers[File.basename(filename).sub(/_expected\.yaml$/, '').to_sym] = File.read(filename) }
14
16
  end
15
17
 
16
18
  should "pass basic sanitization and result in Nokogiri documents" do
@@ -31,11 +33,22 @@ class TestCorpus < Test::Unit::TestCase
31
33
  end
32
34
  end
33
35
 
34
- should "pass content extraction tests" do
36
+ should "pass base reader content extraction tests" do
35
37
  @reader_metadata.each do |file, expected|
36
- @doc = Reader::Document.new(@corpus[file])
38
+ @doc = Reader::Document.create(@corpus[file])
37
39
  assert_equal expected, @doc.sentences(2)
38
40
  end
39
- end
41
+ end
42
+
43
+ should "pass reader content extraction tests" do
44
+ @readers.each do |reader, expected|
45
+ results = YAML.load(expected)
46
+ results.each_key do |file|
47
+ @doc = Document.new(@corpus[file], :reader => reader)
48
+ assert_equal results[file][0..1000], @doc.body[0..1000]
49
+ end
50
+ end
51
+ end
52
+
40
53
  end
41
54
  end
metadata CHANGED
@@ -1,110 +1,109 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: pismo
3
- version: !ruby/object:Gem::Version
4
- prerelease: false
5
- segments:
6
- - 0
7
- - 7
8
- - 2
9
- version: 0.7.2
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.7.4
10
5
  platform: ruby
11
- authors:
6
+ authors:
12
7
  - Peter Cooper
13
8
  autorequire:
14
9
  bindir: bin
15
10
  cert_chain: []
16
-
17
- date: 2010-07-27 00:00:00 +01:00
18
- default_executable: pismo
19
- dependencies:
20
- - !ruby/object:Gem::Dependency
11
+ date: 2010-12-19 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
21
14
  name: shoulda
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
22
21
  prerelease: false
23
- requirement: &id001 !ruby/object:Gem::Requirement
24
- none: false
25
- requirements:
26
- - - ">="
27
- - !ruby/object:Gem::Version
28
- segments:
29
- - 0
30
- version: "0"
31
- type: :runtime
32
- version_requirements: *id001
33
- - !ruby/object:Gem::Dependency
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
34
28
  name: awesome_print
35
- prerelease: false
36
- requirement: &id002 !ruby/object:Gem::Requirement
37
- none: false
38
- requirements:
39
- - - ">="
40
- - !ruby/object:Gem::Version
41
- segments:
42
- - 0
43
- version: "0"
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
44
34
  type: :runtime
45
- version_requirements: *id002
46
- - !ruby/object:Gem::Dependency
47
- name: nokogiri
48
35
  prerelease: false
49
- requirement: &id003 !ruby/object:Gem::Requirement
50
- none: false
51
- requirements:
52
- - - ">="
53
- - !ruby/object:Gem::Version
54
- segments:
55
- - 0
56
- version: "0"
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
57
48
  type: :runtime
58
- version_requirements: *id003
59
- - !ruby/object:Gem::Dependency
60
- name: sanitize
61
49
  prerelease: false
62
- requirement: &id004 !ruby/object:Gem::Requirement
63
- none: false
64
- requirements:
65
- - - ">="
66
- - !ruby/object:Gem::Version
67
- segments:
68
- - 0
69
- version: "0"
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: sanitize
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
70
62
  type: :runtime
71
- version_requirements: *id004
72
- - !ruby/object:Gem::Dependency
73
- name: fast-stemmer
74
63
  prerelease: false
75
- requirement: &id005 !ruby/object:Gem::Requirement
76
- none: false
77
- requirements:
78
- - - ">="
79
- - !ruby/object:Gem::Version
80
- segments:
81
- - 0
82
- version: "0"
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: fast-stemmer
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
83
76
  type: :runtime
84
- version_requirements: *id005
85
- - !ruby/object:Gem::Dependency
86
- name: chronic
87
77
  prerelease: false
88
- requirement: &id006 !ruby/object:Gem::Requirement
89
- none: false
90
- requirements:
91
- - - ">="
92
- - !ruby/object:Gem::Version
93
- segments:
94
- - 0
95
- version: "0"
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: chronic
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - '>='
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
96
90
  type: :runtime
97
- version_requirements: *id006
98
- description: Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, body text, keywords, RSS feed URL, favicon, etc.
99
- email:
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - '>='
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description: Pismo extracts and retrieves content-related metadata from HTML pages
98
+ - you can use the resulting data in an organized way, such as a summary/first paragraph,
99
+ body text, keywords, RSS feed URL, favicon, etc.
100
+ email:
100
101
  - git@peterc.org
101
- executables:
102
+ executables:
102
103
  - pismo
103
104
  extensions: []
104
-
105
105
  extra_rdoc_files: []
106
-
107
- files:
106
+ files:
108
107
  - .document
109
108
  - .gitignore
110
109
  - Gemfile
@@ -118,6 +117,9 @@ files:
118
117
  - lib/pismo/external_attributes.rb
119
118
  - lib/pismo/internal_attributes.rb
120
119
  - lib/pismo/reader.rb
120
+ - lib/pismo/reader/base.rb
121
+ - lib/pismo/reader/cluster.rb
122
+ - lib/pismo/reader/tree.rb
121
123
  - lib/pismo/stopwords.txt
122
124
  - lib/pismo/version.rb
123
125
  - pismo.gemspec
@@ -129,52 +131,45 @@ files:
129
131
  - test/corpus/gmane.html
130
132
  - test/corpus/huffington.html
131
133
  - test/corpus/metadata_expected.yaml
132
- - test/corpus/metadata_expected.yaml.old
133
134
  - test/corpus/queness.html
134
135
  - test/corpus/reader_expected.yaml
136
+ - test/corpus/readers/cluster_expected.yaml
137
+ - test/corpus/readers/tree_expected.yaml
135
138
  - test/corpus/rubyinside.html
136
139
  - test/corpus/rww.html
137
140
  - test/corpus/spolsky.html
138
141
  - test/corpus/techcrunch.html
142
+ - test/corpus/thegoodbookblog.html
139
143
  - test/corpus/tweet.html
140
144
  - test/corpus/youtube.html
141
145
  - test/corpus/zefrank.html
142
146
  - test/helper.rb
143
147
  - test/test_corpus.rb
144
148
  - test/test_pismo_document.rb
145
- has_rdoc: true
146
149
  homepage: http://github.com/peterc/pismo
147
150
  licenses: []
148
-
151
+ metadata: {}
149
152
  post_install_message:
150
153
  rdoc_options: []
151
-
152
- require_paths:
154
+ require_paths:
153
155
  - lib
154
- required_ruby_version: !ruby/object:Gem::Requirement
155
- none: false
156
- requirements:
157
- - - ">="
158
- - !ruby/object:Gem::Version
159
- segments:
160
- - 0
161
- version: "0"
162
- required_rubygems_version: !ruby/object:Gem::Requirement
163
- none: false
164
- requirements:
165
- - - ">="
166
- - !ruby/object:Gem::Version
167
- segments:
168
- - 0
169
- version: "0"
156
+ required_ruby_version: !ruby/object:Gem::Requirement
157
+ requirements:
158
+ - - '>='
159
+ - !ruby/object:Gem::Version
160
+ version: '0'
161
+ required_rubygems_version: !ruby/object:Gem::Requirement
162
+ requirements:
163
+ - - '>='
164
+ - !ruby/object:Gem::Version
165
+ version: '0'
170
166
  requirements: []
171
-
172
167
  rubyforge_project: pismo
173
- rubygems_version: 1.3.7
168
+ rubygems_version: 2.0.0
174
169
  signing_key:
175
- specification_version: 3
170
+ specification_version: 4
176
171
  summary: Extracts or retrieves content-related metadata from HTML pages
177
- test_files:
172
+ test_files:
178
173
  - test/corpus/bbcnews.html
179
174
  - test/corpus/bbcnews2.html
180
175
  - test/corpus/briancray.html
@@ -183,13 +178,15 @@ test_files:
183
178
  - test/corpus/gmane.html
184
179
  - test/corpus/huffington.html
185
180
  - test/corpus/metadata_expected.yaml
186
- - test/corpus/metadata_expected.yaml.old
187
181
  - test/corpus/queness.html
188
182
  - test/corpus/reader_expected.yaml
183
+ - test/corpus/readers/cluster_expected.yaml
184
+ - test/corpus/readers/tree_expected.yaml
189
185
  - test/corpus/rubyinside.html
190
186
  - test/corpus/rww.html
191
187
  - test/corpus/spolsky.html
192
188
  - test/corpus/techcrunch.html
189
+ - test/corpus/thegoodbookblog.html
193
190
  - test/corpus/tweet.html
194
191
  - test/corpus/youtube.html
195
192
  - test/corpus/zefrank.html
@@ -1,122 +0,0 @@
1
- ---
2
- :rww:
3
- :title: "Cartoon: Apple Tablet: Now With Barometer and Bird Call Generator"
4
- :feed: http://www.readwriteweb.com/rss.xml
5
- :briancray:
6
- :title: 5 great examples of popular blog posts that you should know
7
- :feed: http://feeds.feedburner.com/briancray/blog
8
- :lede: "This is a mock post. While there is a place for all of these posts, I'm trying to make a point that original blogs are being shut out by formulaic blogs."
9
- :huffington:
10
- :title: Afghans Losing Hope After 8 Years Of War
11
- :author: TODD PITMAN
12
- :feed: http://feeds.huffingtonpost.com/huffingtonpost/raw_feed
13
- :lede: "KABUL - The man on the motorcycle was going the wrong way down a one-way street, gesturing indignantly for the phalanx of traffic-clogged cars in front of him to move."
14
- :bbcnews:
15
- :title: Gay Muslims made homeless by family violence
16
- :author: Poonam Taneja
17
- :description: A charity is dealing with more gay Muslims made homeless after fleeing forced marriages and so-called "honour" violence.
18
- :lede: A UK charity is dealing with an increasing number of young gay Muslims becoming homeless after fleeing forced marriages and so-called honour violence.
19
- :feed: http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/rss.xml
20
- :factor:
21
- :title: Factor's bootstrap process explained
22
- :lede: "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap. "
23
- :youtube:
24
- :title: YMO - Rydeen (Official Video)
25
- :author: ymo1965
26
- :spolsky:
27
- :title: The Absolute Minimum Every Software Developer Absolutely, Positively Must Know About Unicode and Character Sets (No Excuses!)
28
- :description: Haven't mastered the basics of Unicode and character sets? Please don't write another line of code until you've read this article.
29
- :author: Joel Spolsky
30
- :favicon: /favicon.ico
31
- :feed: http://www.joelonsoftware.com/rss.xml
32
- :techcrunch:
33
- :title: Googlle Gets A Sexy New Logo; Remains Sketchy
34
- :author: MG Siegler
35
- :keywords:
36
- - - googlle
37
- - 35
38
- - - logo
39
- - 10
40
- - - google
41
- - 6
42
- - - site
43
- - 3
44
- - - font
45
- - 2
46
- - - india
47
- - 2
48
- - - surprised
49
- - 1
50
- - - week
51
- - 1
52
- - - switched
53
- - 1
54
- - - school
55
- - 1
56
- - - things
57
- - 1
58
- - - removing
59
- - 1
60
- - - steve
61
- - 1
62
- - - decided
63
- - 1
64
- - - advantage
65
- - 1
66
- - - wasn
67
- - 1
68
- - - accepting
69
- - 1
70
- - - red
71
- - 1
72
- - - copy
73
- - 1
74
- - - wouldn
75
- - 1
76
- :rubyinside:
77
- :title: "CoffeeScript: A New Language With A Pure Ruby Compiler"
78
- :author: Peter Cooper
79
- :lede: CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it "JavaScript's less ostentatious kid brother" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.
80
- :body: "CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it \"JavaScript's less ostentatious kid brother\" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.\nTo get a feel for the language, check out this example code (CoffeeScript on the left, resulting JavaScript on the right):\nAs a Ruby project, you can get the CoffeeScript compiler installed with a simple gem install coffee-script or check out the code from/on GitHub. The code is worth a look as it's notably quite vanilla with hand crafted Ruby covering the lexer and code generation and Racc built code for the parser.\n"
81
- :feed: http://www.rubyinside.com/feed/
82
- :keywords:
83
- - - ruby
84
- - 15
85
- - - coffeescript
86
- - 15
87
- - - compiler
88
- - 10
89
- - - language
90
- - 10
91
- - - coffee
92
- - 5
93
- - - pure
94
- - 5
95
- - - code
96
- - 5
97
- - - script
98
- - 5
99
- - - javascript
100
- - 3
101
- - - github
102
- - 2
103
- - - syntax
104
- - 1
105
- - - programming
106
- - 1
107
- - - brother
108
- - 1
109
- - - constructs
110
- - 1
111
- - - vanilla
112
- - 1
113
- - - parser
114
- - 1
115
- - - lexer
116
- - 1
117
- - - project
118
- - 1
119
- - - installed
120
- - 1
121
- - - simple
122
- - 1