nddrylliog_pismo 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. data/.document +5 -0
  2. data/.gitignore +29 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE +23 -0
  5. data/NOTICE +4 -0
  6. data/README.markdown +131 -0
  7. data/Rakefile +72 -0
  8. data/bin/pismo +45 -0
  9. data/lib/pismo.rb +82 -0
  10. data/lib/pismo/document.rb +67 -0
  11. data/lib/pismo/external_attributes.rb +14 -0
  12. data/lib/pismo/internal_attributes.rb +316 -0
  13. data/lib/pismo/reader.rb +19 -0
  14. data/lib/pismo/reader/base.rb +259 -0
  15. data/lib/pismo/reader/cluster.rb +171 -0
  16. data/lib/pismo/reader/tree.rb +154 -0
  17. data/lib/pismo/stopwords.txt +1002 -0
  18. data/lib/pismo/version.rb +3 -0
  19. data/pismo.gemspec +30 -0
  20. data/test/corpus/bbcnews.html +2131 -0
  21. data/test/corpus/bbcnews2.html +1575 -0
  22. data/test/corpus/briancray.html +269 -0
  23. data/test/corpus/cant_read.html +426 -0
  24. data/test/corpus/factor.html +1362 -0
  25. data/test/corpus/gmane.html +138 -0
  26. data/test/corpus/huffington.html +2932 -0
  27. data/test/corpus/metadata_expected.yaml +72 -0
  28. data/test/corpus/metadata_expected.yaml.old +122 -0
  29. data/test/corpus/queness.html +919 -0
  30. data/test/corpus/reader_expected.yaml +39 -0
  31. data/test/corpus/readers/cluster_expected.yaml +45 -0
  32. data/test/corpus/readers/tree_expected.yaml +55 -0
  33. data/test/corpus/rubyinside.html +318 -0
  34. data/test/corpus/rww.html +1351 -0
  35. data/test/corpus/spolsky.html +298 -0
  36. data/test/corpus/techcrunch.html +1285 -0
  37. data/test/corpus/tweet.html +360 -0
  38. data/test/corpus/youtube.html +2348 -0
  39. data/test/corpus/zefrank.html +535 -0
  40. data/test/helper.rb +15 -0
  41. data/test/test_corpus.rb +54 -0
  42. data/test/test_pismo_document.rb +34 -0
  43. metadata +156 -0
@@ -0,0 +1,15 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+ require 'open-uri'
5
+ require 'yaml'
6
+ begin; require 'turn'; rescue LoadError; end
7
+
8
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
9
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
10
+ require 'pismo'
11
+
12
+ class Test::Unit::TestCase
13
+ include Pismo
14
+ HTML_DIRECTORY = File.dirname(__FILE__) + "/corpus"
15
+ end
@@ -0,0 +1,54 @@
1
+ require 'helper'
2
+
3
+ class TestCorpus < Test::Unit::TestCase
4
+
5
+ context "A corpus of HTML documents" do
6
+ setup do
7
+ # Load the corpus files' HTML content into a hash
8
+ @corpus = {}
9
+ Dir[HTML_DIRECTORY + "/*.html"].each { |filename| @corpus[File.basename(filename).sub(/\.html$/, '').to_sym] = File.read(filename) }
10
+
11
+ # Load the "expected metadata" ready for tests
12
+ @metadata = YAML.load(open(HTML_DIRECTORY + "/metadata_expected.yaml"))
13
+ @reader_metadata = YAML.load(open(HTML_DIRECTORY + "/reader_expected.yaml"))
14
+ @readers = {}
15
+ Dir[HTML_DIRECTORY + "/readers/*_expected.yaml"].each { |filename| @readers[File.basename(filename).sub(/_expected\.yaml$/, '').to_sym] = File.read(filename) }
16
+ end
17
+
18
+ should "pass basic sanitization and result in Nokogiri documents" do
19
+ @corpus.values.each do |html|
20
+ doc = Document.new(html)
21
+ assert doc.html.length > 1000
22
+ assert doc.doc.kind_of?(Nokogiri::HTML::Document)
23
+ end
24
+ end
25
+
26
+ should "pass metadata extraction tests" do
27
+
28
+ @metadata.each do |file, expected|
29
+ @doc = Document.new(@corpus[file])
30
+ expected.each do |k, v|
31
+ assert_equal v, @doc.send(k)
32
+ end
33
+ end
34
+ end
35
+
36
+ should "pass base reader content extraction tests" do
37
+ @reader_metadata.each do |file, expected|
38
+ @doc = Reader::Document.create(@corpus[file])
39
+ assert_equal expected, @doc.sentences(2)
40
+ end
41
+ end
42
+
43
+ should "pass reader content extraction tests" do
44
+ @readers.each do |reader, expected|
45
+ results = YAML.load(expected)
46
+ results.each_key do |file|
47
+ @doc = Document.new(@corpus[file], :reader => reader)
48
+ assert_equal results[file], @doc.body
49
+ end
50
+ end
51
+ end
52
+
53
+ end
54
+ end
@@ -0,0 +1,34 @@
1
+ require 'helper'
2
+
3
+ class TestPismoDocument < Test::Unit::TestCase
4
+ context "Pismo::Document" do
5
+ should "process an IO/File object" do
6
+ doc = Document.new(open(HTML_DIRECTORY + "/rubyinside.html"))
7
+ assert doc.doc.kind_of?(Nokogiri::HTML::Document)
8
+ end
9
+ end
10
+
11
+ context "A very basic Pismo document" do
12
+ setup do
13
+ @doc = Document.new(%{<html><body><h1>Hello</h1></body></html>})
14
+ end
15
+
16
+ should "pass sanitization" do
17
+ assert_equal %{<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">\n<html><body><h1>Hello</h1></body></html>\n}, @doc.html
18
+ end
19
+
20
+ should "result in a Nokogiri document" do
21
+ assert @doc.doc.kind_of?(Nokogiri::HTML::Document)
22
+ end
23
+ end
24
+
25
+ context "A basic real world blog post" do
26
+ setup do
27
+ @doc = Document.new(open(HTML_DIRECTORY + "/rubyinside.html"))
28
+ end
29
+
30
+ should "provide a title" do
31
+ assert_equal "CoffeeScript: A New Language With A Pure Ruby Compiler", @doc.title
32
+ end
33
+ end
34
+ end
metadata ADDED
@@ -0,0 +1,156 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: nddrylliog_pismo
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.7.3
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Peter Cooper
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2010-12-19 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: shoulda
16
+ requirement: &14086660 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *14086660
25
+ - !ruby/object:Gem::Dependency
26
+ name: awesome_print
27
+ requirement: &14085980 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *14085980
36
+ - !ruby/object:Gem::Dependency
37
+ name: nokogiri
38
+ requirement: &14085420 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *14085420
47
+ - !ruby/object:Gem::Dependency
48
+ name: sanitize
49
+ requirement: &14084820 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: *14084820
58
+ - !ruby/object:Gem::Dependency
59
+ name: fast-stemmer
60
+ requirement: &14084200 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ type: :runtime
67
+ prerelease: false
68
+ version_requirements: *14084200
69
+ - !ruby/object:Gem::Dependency
70
+ name: chronic
71
+ requirement: &14100720 !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ! '>='
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ type: :runtime
78
+ prerelease: false
79
+ version_requirements: *14100720
80
+ description: Pismo extracts and retrieves content-related metadata from HTML pages
81
+ - you can use the resulting data in an organized way, such as a summary/first paragraph,
82
+ body text, keywords, RSS feed URL, favicon, etc.
83
+ email:
84
+ - git@peterc.org
85
+ executables:
86
+ - pismo
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - .document
91
+ - .gitignore
92
+ - Gemfile
93
+ - LICENSE
94
+ - NOTICE
95
+ - README.markdown
96
+ - Rakefile
97
+ - bin/pismo
98
+ - lib/pismo.rb
99
+ - lib/pismo/document.rb
100
+ - lib/pismo/external_attributes.rb
101
+ - lib/pismo/internal_attributes.rb
102
+ - lib/pismo/reader.rb
103
+ - lib/pismo/reader/base.rb
104
+ - lib/pismo/reader/cluster.rb
105
+ - lib/pismo/reader/tree.rb
106
+ - lib/pismo/stopwords.txt
107
+ - lib/pismo/version.rb
108
+ - pismo.gemspec
109
+ - test/corpus/bbcnews.html
110
+ - test/corpus/bbcnews2.html
111
+ - test/corpus/briancray.html
112
+ - test/corpus/cant_read.html
113
+ - test/corpus/factor.html
114
+ - test/corpus/gmane.html
115
+ - test/corpus/huffington.html
116
+ - test/corpus/metadata_expected.yaml
117
+ - test/corpus/metadata_expected.yaml.old
118
+ - test/corpus/queness.html
119
+ - test/corpus/reader_expected.yaml
120
+ - test/corpus/readers/cluster_expected.yaml
121
+ - test/corpus/readers/tree_expected.yaml
122
+ - test/corpus/rubyinside.html
123
+ - test/corpus/rww.html
124
+ - test/corpus/spolsky.html
125
+ - test/corpus/techcrunch.html
126
+ - test/corpus/tweet.html
127
+ - test/corpus/youtube.html
128
+ - test/corpus/zefrank.html
129
+ - test/helper.rb
130
+ - test/test_corpus.rb
131
+ - test/test_pismo_document.rb
132
+ homepage: http://github.com/peterc/pismo
133
+ licenses: []
134
+ post_install_message:
135
+ rdoc_options: []
136
+ require_paths:
137
+ - lib
138
+ required_ruby_version: !ruby/object:Gem::Requirement
139
+ none: false
140
+ requirements:
141
+ - - ! '>='
142
+ - !ruby/object:Gem::Version
143
+ version: '0'
144
+ required_rubygems_version: !ruby/object:Gem::Requirement
145
+ none: false
146
+ requirements:
147
+ - - ! '>='
148
+ - !ruby/object:Gem::Version
149
+ version: '0'
150
+ requirements: []
151
+ rubyforge_project: nddrylliog_pismo
152
+ rubygems_version: 1.8.17
153
+ signing_key:
154
+ specification_version: 3
155
+ summary: Extracts or retrieves content-related metadata from HTML pages
156
+ test_files: []