csteamer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+ require 'matchy'
5
+ require 'open-uri'
6
+ require 'yaml'
7
+
8
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
9
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
10
+ require 'csteamer'
11
+
12
# Shared test helper: reopens Test::Unit::TestCase so that every test case in
# the suite mixes in CSteamer and knows where the HTML fixtures are stored.
class Test::Unit::TestCase
  # Mixing in the module makes its constants reachable from the test classes
  # (the tests refer to Document without a CSteamer:: prefix).
  include CSteamer

  # Path of the "corpus" directory that sits next to this helper file.
  HTML_DIRECTORY = "#{File.dirname(__FILE__)}/corpus"
end
@@ -0,0 +1,33 @@
1
+ require 'helper'
2
+
3
# Runs Document against every HTML fixture found in HTML_DIRECTORY and checks
# the results against the expectations stored in metadata_expected.yaml.
class TestCorpus < Test::Unit::TestCase
  context "A corpus of HTML documents" do
    setup do
      # Load the corpus files' HTML content into a hash, keyed by the file's
      # base name (without the ".html" suffix) as a symbol.
      @corpus = {}
      Dir[HTML_DIRECTORY + "/*.html"].each do |path|
        @corpus[File.basename(path, ".html").to_sym] = File.read(path)
      end

      # Load the "expected metadata" ready for tests. The block form of
      # File.open closes the handle as soon as parsing has finished.
      @metadata = File.open(HTML_DIRECTORY + "/metadata_expected.yaml") { |io| YAML.load(io) }
    end

    should "pass basic sanitization and result in Nokogiri documents" do
      @corpus.each_value do |html|
        doc = Document.new(html)
        doc.html.length.should > 1000
        doc.doc.kind_of?(Nokogiri::HTML::Document).should == true
      end
    end

    should "pass metadata extraction tests" do
      @metadata.each do |file, expected|
        # fetch raises if the YAML names a fixture that was not loaded into
        # @corpus, instead of handing nil to Document.new.
        @doc = Document.new(@corpus.fetch(file))

        # Each key of the expectation hash is called as a method on the
        # document and its result compared with the stored value.
        expected.each do |attribute, value|
          @doc.send(attribute).should == value
        end
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
+ require 'helper'
2
+
3
# Unit tests for Document: construction from an IO object and from a literal
# HTML string, plus title extraction on one real-world fixture.
class TestCSteamerDocument < Test::Unit::TestCase
  context "CSteamer::Document" do
    should "process an IO/File object" do
      # Block form so the fixture's file handle is closed when the test ends.
      File.open(HTML_DIRECTORY + "/rubyinside.html") do |io|
        doc = Document.new(io)
        doc.doc.kind_of?(Nokogiri::HTML::Document).should == true
      end
    end
  end

  context "A very basic CSteamer document" do
    setup do
      @doc = Document.new(%{<html><body><h1>Hello</h1></body></html>})
    end

    should "pass sanitization" do
      @doc.html.should == %{<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">\n<html><body><h1>Hello</h1></body></html>\n}
    end

    should "result in a Nokogiri document" do
      @doc.doc.kind_of?(Nokogiri::HTML::Document).should == true
    end
  end

  context "A basic real world blog post" do
    setup do
      # Keep the handle so teardown can close it; the file stays open for the
      # duration of each test in case Document reads from it after construction.
      @html_file = File.open(HTML_DIRECTORY + "/rubyinside.html")
      @doc = Document.new(@html_file)
    end

    teardown do
      # Guarded: setup may have failed before opening the file, and the handle
      # may already have been closed by the time we get here.
      @html_file.close if @html_file && !@html_file.closed?
    end

    should "provide a title" do
      @doc.title.should == "CoffeeScript: A New Language With A Pure Ruby Compiler"
    end
  end
end
metadata ADDED
@@ -0,0 +1,126 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: csteamer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Peter Cooper
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-01-11 00:00:00 +00:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: thoughtbot-shoulda
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: mhennemeyer-matchy
27
+ type: :development
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: nokogiri
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: loofah
47
+ type: :runtime
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: "0"
54
+ version:
55
+ - !ruby/object:Gem::Dependency
56
+ name: httparty
57
+ type: :runtime
58
+ version_requirement:
59
+ version_requirements: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: "0"
64
+ version:
65
+ description: CSteamer "steams" your content for data you can use in an organized way, such as a summary/first paragraph, del.icio.us tags, first image used in the content block, etc.
66
+ email: git@peterc.org
67
+ executables: []
68
+
69
+ extensions: []
70
+
71
+ extra_rdoc_files:
72
+ - LICENSE
73
+ - README.rdoc
74
+ files:
75
+ - .document
76
+ - .gitignore
77
+ - LICENSE
78
+ - README.rdoc
79
+ - Rakefile
80
+ - VERSION
81
+ - lib/csteamer.rb
82
+ - lib/csteamer/document.rb
83
+ - lib/csteamer/external_attributes.rb
84
+ - lib/csteamer/internal_attributes.rb
85
+ - test/corpus/bbcnews.html
86
+ - test/corpus/briancray.html
87
+ - test/corpus/huffington.html
88
+ - test/corpus/metadata_expected.yaml
89
+ - test/corpus/rubyinside.html
90
+ - test/corpus/rww.html
91
+ - test/corpus/techcrunch.html
92
+ - test/helper.rb
93
+ - test/test_corpus.rb
94
+ - test/test_csteamer_document.rb
95
+ has_rdoc: true
96
+ homepage: http://github.com/peterc/csteamer
97
+ licenses: []
98
+
99
+ post_install_message:
100
+ rdoc_options:
101
+ - --charset=UTF-8
102
+ require_paths:
103
+ - lib
104
+ required_ruby_version: !ruby/object:Gem::Requirement
105
+ requirements:
106
+ - - ">="
107
+ - !ruby/object:Gem::Version
108
+ version: "0"
109
+ version:
110
+ required_rubygems_version: !ruby/object:Gem::Requirement
111
+ requirements:
112
+ - - ">="
113
+ - !ruby/object:Gem::Version
114
+ version: "0"
115
+ version:
116
+ requirements: []
117
+
118
+ rubyforge_project:
119
+ rubygems_version: 1.3.5
120
+ signing_key:
121
+ specification_version: 3
122
+ summary: Extracts or retrieves content-related metadata from HTML pages and remote services
123
+ test_files:
124
+ - test/helper.rb
125
+ - test/test_corpus.rb
126
+ - test/test_csteamer_document.rb