scraped_resource 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.rdoc +3 -0
- data/Rakefile +54 -0
- data/VERSION +1 -0
- data/scraped_resource.gemspec +81 -0
- data/spec/models/scraped_resource/attribute_spec.rb +28 -0
- data/spec/models/scraped_resource/base_spec.rb +22 -0
- data/spec/models/scraped_resource/csv/list_spec.rb +64 -0
- data/spec/models/scraped_resource/excel/base_spec.rb +9 -0
- data/spec/models/scraped_resource/excel/list_spec.rb +98 -0
- data/spec/models/scraped_resource/excel/show_spec.rb +0 -0
- data/spec/models/scraped_resource/html/list_spec.rb +67 -0
- data/spec/models/scraped_resource/normalizer/base_spec.rb +8 -0
- data/spec/models/scraped_resource/normalizer/numeric_spec.rb +28 -0
- data/spec/models/scraped_resource/row_spec.rb +18 -0
- data/spec/models/scraped_resource/util_spec.rb +7 -0
- data/spec/spec_helper.rb +38 -0
- metadata +220 -0
data/.gitignore
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2010 hasclass
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
begin
|
5
|
+
require 'jeweler'
|
6
|
+
Jeweler::Tasks.new do |gem|
|
7
|
+
gem.name = "scraped_resource"
|
8
|
+
gem.summary = %Q{Webscraping framework}
|
9
|
+
gem.description = %Q{Webscraping framework}
|
10
|
+
gem.email = "sebi.burkhard@gmail.com"
|
11
|
+
gem.homepage = "http://github.com/hasclass/scraped_resource"
|
12
|
+
gem.authors = ["hasclass"]
|
13
|
+
|
14
|
+
gem.add_dependency 'activesupport'
|
15
|
+
gem.add_dependency 'fastercsv', '1.5.3'
|
16
|
+
gem.add_dependency 'mechanize', '1.0.0'
|
17
|
+
gem.add_dependency 'roo', '1.9.3'
|
18
|
+
gem.add_dependency 'google-spreadsheet-ruby'
|
19
|
+
gem.add_dependency 'rubyzip', '0.9.4'
|
20
|
+
gem.add_dependency 'spreadsheet', '0.6.4.1'
|
21
|
+
|
22
|
+
gem.add_development_dependency 'rspec', '1.3.0'
|
23
|
+
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
24
|
+
end
|
25
|
+
Jeweler::GemcutterTasks.new
|
26
|
+
rescue LoadError
|
27
|
+
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
28
|
+
end
|
29
|
+
|
30
|
+
require 'spec/rake/spectask'
|
31
|
+
Spec::Rake::SpecTask.new(:spec) do |spec|
|
32
|
+
spec.libs << 'lib' << 'spec'
|
33
|
+
spec.spec_files = FileList['spec/**/*_spec.rb']
|
34
|
+
end
|
35
|
+
|
36
|
+
Spec::Rake::SpecTask.new(:rcov) do |spec|
|
37
|
+
spec.libs << 'lib' << 'spec'
|
38
|
+
spec.pattern = 'spec/**/*_spec.rb'
|
39
|
+
spec.rcov = true
|
40
|
+
end
|
41
|
+
|
42
|
+
# :check_dependencies is not defined anywhere in this Rakefile; it is expected
# to be provided by Jeweler::Tasks. When Jeweler failed to load (LoadError
# branch earlier in this file) the task does not exist, so only add it as a
# prerequisite when it is actually defined. Otherwise `rake spec` (and the
# default task) would abort on an unknown prerequisite.
task :spec => :check_dependencies if Rake::Task.task_defined?('check_dependencies')
|
43
|
+
|
44
|
+
task :default => :spec
|
45
|
+
|
46
|
+
require 'rake/rdoctask'
|
47
|
+
Rake::RDocTask.new do |rdoc|
|
48
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
49
|
+
|
50
|
+
rdoc.rdoc_dir = 'rdoc'
|
51
|
+
rdoc.title = "scraped #{version}"
|
52
|
+
rdoc.rdoc_files.include('README*')
|
53
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
54
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.1
|
@@ -0,0 +1,81 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{scraped_resource}
|
8
|
+
s.version = "0.0.1"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["hasclass"]
|
12
|
+
s.date = %q{2010-10-01}
|
13
|
+
s.description = %q{Webscraping framework}
|
14
|
+
s.email = %q{sebi.burkhard@gmail.com}
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE",
|
17
|
+
"README.rdoc"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
".gitignore",
|
21
|
+
"LICENSE",
|
22
|
+
"README.rdoc",
|
23
|
+
"Rakefile",
|
24
|
+
"VERSION",
|
25
|
+
"scraped_resource.gemspec"
|
26
|
+
]
|
27
|
+
s.homepage = %q{http://github.com/hasclass/scraped_resource}
|
28
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
29
|
+
s.require_paths = ["lib"]
|
30
|
+
s.rubygems_version = %q{1.3.7}
|
31
|
+
s.summary = %q{Webscraping framework}
|
32
|
+
s.test_files = [
|
33
|
+
"spec/models/scraped_resource/attribute_spec.rb",
|
34
|
+
"spec/models/scraped_resource/base_spec.rb",
|
35
|
+
"spec/models/scraped_resource/csv/list_spec.rb",
|
36
|
+
"spec/models/scraped_resource/excel/base_spec.rb",
|
37
|
+
"spec/models/scraped_resource/excel/list_spec.rb",
|
38
|
+
"spec/models/scraped_resource/excel/show_spec.rb",
|
39
|
+
"spec/models/scraped_resource/html/list_spec.rb",
|
40
|
+
"spec/models/scraped_resource/normalizer/base_spec.rb",
|
41
|
+
"spec/models/scraped_resource/normalizer/numeric_spec.rb",
|
42
|
+
"spec/models/scraped_resource/row_spec.rb",
|
43
|
+
"spec/models/scraped_resource/util_spec.rb",
|
44
|
+
"spec/spec_helper.rb"
|
45
|
+
]
|
46
|
+
|
47
|
+
if s.respond_to? :specification_version then
|
48
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
49
|
+
s.specification_version = 3
|
50
|
+
|
51
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
52
|
+
s.add_runtime_dependency(%q<activesupport>, [">= 0"])
|
53
|
+
s.add_runtime_dependency(%q<fastercsv>, ["= 1.5.3"])
|
54
|
+
s.add_runtime_dependency(%q<mechanize>, ["= 1.0.0"])
|
55
|
+
s.add_runtime_dependency(%q<roo>, ["= 1.9.3"])
|
56
|
+
s.add_runtime_dependency(%q<google-spreadsheet-ruby>, [">= 0"])
|
57
|
+
s.add_runtime_dependency(%q<rubyzip>, ["= 0.9.4"])
|
58
|
+
s.add_runtime_dependency(%q<spreadsheet>, ["= 0.6.4.1"])
|
59
|
+
s.add_development_dependency(%q<rspec>, ["= 1.3.0"])
|
60
|
+
else
|
61
|
+
s.add_dependency(%q<activesupport>, [">= 0"])
|
62
|
+
s.add_dependency(%q<fastercsv>, ["= 1.5.3"])
|
63
|
+
s.add_dependency(%q<mechanize>, ["= 1.0.0"])
|
64
|
+
s.add_dependency(%q<roo>, ["= 1.9.3"])
|
65
|
+
s.add_dependency(%q<google-spreadsheet-ruby>, [">= 0"])
|
66
|
+
s.add_dependency(%q<rubyzip>, ["= 0.9.4"])
|
67
|
+
s.add_dependency(%q<spreadsheet>, ["= 0.6.4.1"])
|
68
|
+
s.add_dependency(%q<rspec>, ["= 1.3.0"])
|
69
|
+
end
|
70
|
+
else
|
71
|
+
s.add_dependency(%q<activesupport>, [">= 0"])
|
72
|
+
s.add_dependency(%q<fastercsv>, ["= 1.5.3"])
|
73
|
+
s.add_dependency(%q<mechanize>, ["= 1.0.0"])
|
74
|
+
s.add_dependency(%q<roo>, ["= 1.9.3"])
|
75
|
+
s.add_dependency(%q<google-spreadsheet-ruby>, [">= 0"])
|
76
|
+
s.add_dependency(%q<rubyzip>, ["= 0.9.4"])
|
77
|
+
s.add_dependency(%q<spreadsheet>, ["= 0.6.4.1"])
|
78
|
+
s.add_dependency(%q<rspec>, ["= 1.3.0"])
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
|
4
|
+
module ScrapedResource
|
5
|
+
|
6
|
+
|
7
|
+
describe Attribute do
|
8
|
+
before { @attr = Attribute.new('100') }
|
9
|
+
|
10
|
+
describe "one formatter" do
|
11
|
+
before {
|
12
|
+
@attr = Attribute.new('100', :format => :to_float )
|
13
|
+
@attr.should_receive(:to_float).with('100').and_return(100)
|
14
|
+
}
|
15
|
+
specify { @attr.value.should == 100 }
|
16
|
+
end
|
17
|
+
|
18
|
+
describe "format chain" do
|
19
|
+
before {
|
20
|
+
@attr = Attribute.new('100', :format => [:chain_one, :chain_two] )
|
21
|
+
@attr.should_receive(:chain_one).once.with('100').and_return(90)
|
22
|
+
@attr.should_receive(:chain_two).once.with(90).and_return(80)
|
23
|
+
}
|
24
|
+
specify { @attr.value.should == 80 }
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module ScrapedResource
|
4
|
+
|
5
|
+
describe Base do
|
6
|
+
|
7
|
+
describe "#options" do
|
8
|
+
subject { Base.options }
|
9
|
+
specify { Base.options[:base_path].should == 'tmp/scraped_resources'}
|
10
|
+
end
|
11
|
+
|
12
|
+
describe "#options[:base_path] = something_else" do
|
13
|
+
# Remember the current value before overwriting it: Base.options is shared
# class-level state, so the change would otherwise leak into every example
# that runs afterwards (including the default-path check in this file).
before do
  @original_path = Base.options[:base_path]
  @new_path = 'new_path'
  Base.options[:base_path] = @new_path
end

# Put the shared setting back so the examples stay order-independent.
after do
  Base.options[:base_path] = @original_path
end
|
17
|
+
specify { Base.options[:base_path].should == @new_path}
|
18
|
+
end
|
19
|
+
|
20
|
+
end
|
21
|
+
|
22
|
+
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
|
4
|
+
module ScrapedResource::Csv
|
5
|
+
|
6
|
+
describe List do
|
7
|
+
class Sample1 < ScrapedResource::Csv::List
|
8
|
+
csv_list do |config|
|
9
|
+
config.headers 1
|
10
|
+
end
|
11
|
+
attribute :name, :key => :name
|
12
|
+
end
|
13
|
+
|
14
|
+
before { @list = Sample1.new('spec/files/csv/sample.csv') }
|
15
|
+
|
16
|
+
subject do
|
17
|
+
@list
|
18
|
+
end
|
19
|
+
its(:document) { should_not be_nil }
|
20
|
+
|
21
|
+
describe :headers do
|
22
|
+
subject {@list.headers}
|
23
|
+
|
24
|
+
it { should_not be_empty}
|
25
|
+
it { should include(:name) }
|
26
|
+
end
|
27
|
+
|
28
|
+
describe :results do
|
29
|
+
before { @results = @list.to_a }
|
30
|
+
subject { @results }
|
31
|
+
|
32
|
+
it { should have(2).items }
|
33
|
+
specify { @results.any?{|r| r[:name] == 'Foo'}.should be_true }
|
34
|
+
specify { @results.any?{|r| r[:name] == 'Hello'}.should be_true }
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
describe List do
|
39
|
+
class Sample2 < ScrapedResource::Csv::List
|
40
|
+
csv_list do |config|
|
41
|
+
config.headers 3
|
42
|
+
end
|
43
|
+
attribute :name, :key => :name
|
44
|
+
end
|
45
|
+
|
46
|
+
before { @list = Sample2.new('spec/files/csv/sample2.csv') }
|
47
|
+
|
48
|
+
describe :headers do
|
49
|
+
subject {@list.headers}
|
50
|
+
it { should_not be_empty}
|
51
|
+
it { should include(:name) }
|
52
|
+
end
|
53
|
+
|
54
|
+
describe :results do
|
55
|
+
before { @results = @list.to_a }
|
56
|
+
subject { @results }
|
57
|
+
|
58
|
+
it { should have(2).items }
|
59
|
+
specify { @results.any?{|r| r[:name] == 'Foo'}.should be_true }
|
60
|
+
specify { @results.any?{|r| r[:name] == 'Hello'}.should be_true }
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
|
4
|
+
module ScrapedResource::Excel
|
5
|
+
|
6
|
+
|
7
|
+
describe List do
|
8
|
+
class Sample1 < ScrapedResource::Excel::List
|
9
|
+
excel_list do |config|
|
10
|
+
config.headers 5
|
11
|
+
end
|
12
|
+
|
13
|
+
attribute :name, :key => :name
|
14
|
+
attribute :custom, :method => :custom_method
|
15
|
+
attribute :list_of_names, :key => [:sym, :symbol]
|
16
|
+
|
17
|
+
attribute :name_plus_foo do |row, value|
|
18
|
+
"#{row[:name]}_plus"
|
19
|
+
end
|
20
|
+
|
21
|
+
attribute :block_with_column, :key => :name do |row, value|
|
22
|
+
"#{value}_plus"
|
23
|
+
end
|
24
|
+
|
25
|
+
def custom_method
|
26
|
+
'custom'
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
before { @list = Sample1.new('spec/files/excel/sample1.xls') }
|
31
|
+
|
32
|
+
subject do
|
33
|
+
@list
|
34
|
+
end
|
35
|
+
|
36
|
+
its(:document) { should_not be_nil }
|
37
|
+
|
38
|
+
# Header keys detected in the sample workbook (subject is @list.headers).
# The :symbol expectation used to be listed twice; one copy is enough.
describe :headers do
  subject { @list.headers }

  it { should_not be_empty }
  it { should include(:symbol) }
  it { should include(:indexanbieter) }
end
|
46
|
+
|
47
|
+
describe :results do
|
48
|
+
before { @results = @list.to_a }
|
49
|
+
subject { @results }
|
50
|
+
|
51
|
+
it { should have(4).items }
|
52
|
+
specify { @results.any?{|r| r[:name] == 'Foo'}.should be_true }
|
53
|
+
specify { @results.first.has_key?(:custom).should be_true }
|
54
|
+
specify { @results.first[:list_of_names].should == '100GBA' }
|
55
|
+
specify { @results.first[:custom].should == 'custom' }
|
56
|
+
specify { @results.first[:name].should == 'Foo' }
|
57
|
+
specify { @results.first[:name_plus_foo].should == 'Foo_plus' }
|
58
|
+
specify { @results.first[:block_with_column].should == 'Foo_plus' }
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
describe List, "#reject_row_if" do
|
63
|
+
class SampleRejectRow < ScrapedResource::Excel::List
|
64
|
+
excel_list do |config|
|
65
|
+
config.headers 5
|
66
|
+
end
|
67
|
+
|
68
|
+
attribute :name, :key => :name
|
69
|
+
|
70
|
+
reject_row_if {|row| row[:name].blank? }
|
71
|
+
reject_row_if {|row| row[:name] == 'Baz' }
|
72
|
+
end
|
73
|
+
|
74
|
+
subject do
|
75
|
+
SampleRejectRow.new('spec/files/excel/sample1.xls')
|
76
|
+
end
|
77
|
+
|
78
|
+
its(:to_a) { should have(2).items }
|
79
|
+
end
|
80
|
+
|
81
|
+
describe List, "#header" do
|
82
|
+
class SampleHeaderWithBlock < ScrapedResource::Excel::List
|
83
|
+
|
84
|
+
excel_list do |config|
|
85
|
+
config.headers do |list|
|
86
|
+
{:name => 1}
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
attribute :name, :key => :name
|
91
|
+
end
|
92
|
+
|
93
|
+
subject do
|
94
|
+
SampleHeaderWithBlock.new('spec/files/excel/sample1.xls')
|
95
|
+
end
|
96
|
+
its(:to_a) { should have(9).items }
|
97
|
+
end
|
98
|
+
end
|
File without changes
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
|
4
|
+
module ScrapedResource::Html
|
5
|
+
describe List do
|
6
|
+
class Xetra < ScrapedResource::Html::List
|
7
|
+
html_list do |config|
|
8
|
+
config.table 'table.fulldouble'
|
9
|
+
config.rows 'tr.sorter ~ tr'
|
10
|
+
config.cells 'td'
|
11
|
+
config.cell_values :inner_html
|
12
|
+
|
13
|
+
config.headers do |base|
|
14
|
+
hsh = {}
|
15
|
+
base.document./("table.fulldouble:first-of-type tr")[0]./("th").each_with_index do |cells, i|
|
16
|
+
txt = cells.inner_text || 'undefined'
|
17
|
+
hsh[ScrapedResource::Utilities.slug(txt).to_sym] = i
|
18
|
+
end
|
19
|
+
hsh
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
attribute :isin, :key => :nameisin do |row, value| value.split('<br>').last; end
|
24
|
+
attribute :default, :default => 'foo'
|
25
|
+
attribute :custom, :method => :custom_method
|
26
|
+
attribute :list_of_names, :key => [:sym, :symbol]
|
27
|
+
attribute :name_plus_foo do |row, value| "#{row[:isin]}_plus"; end
|
28
|
+
attribute :block_with_column, :key => :name do |row, value| "#{value}_plus"; end
|
29
|
+
|
30
|
+
def custom_method
|
31
|
+
'custom'
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
before { @list = Xetra.new('spec/files/html/sample.html') }
|
36
|
+
|
37
|
+
subject do
|
38
|
+
@list
|
39
|
+
end
|
40
|
+
|
41
|
+
|
42
|
+
its(:document) { should_not be_nil }
|
43
|
+
|
44
|
+
describe :headers do
|
45
|
+
subject {@list.headers}
|
46
|
+
|
47
|
+
it { should_not be_empty}
|
48
|
+
it { should include(:nameisin) }
|
49
|
+
specify { @list.headers[:nameisin].should == 0 }
|
50
|
+
end
|
51
|
+
|
52
|
+
describe :results do
|
53
|
+
before { @results = @list.to_a }
|
54
|
+
subject { @results }
|
55
|
+
|
56
|
+
it { should have(20).items }
|
57
|
+
# specify { @results.any?{|r| r[:name] == 'Foo'}.should be_true }
|
58
|
+
# specify { @results.first.has_key?(:custom).should be_true }
|
59
|
+
# specify { @results.first[:list_of_names].should == '100GBA' }
|
60
|
+
# specify { @results.first[:custom].should == 'custom' }
|
61
|
+
specify { @results.first[:isin].should == 'FR0010821728' }
|
62
|
+
specify { @results.first[:default].should == 'foo' }
|
63
|
+
# specify { @results.first[:name_plus_foo].should == 'Foo_plus' }
|
64
|
+
# specify { @results.first[:block_with_column].should == 'Foo_plus' }
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
|
4
|
+
module ScrapedResource
|
5
|
+
describe "floats" do
|
6
|
+
before { @result = 1005.42}
|
7
|
+
|
8
|
+
specify { n = "1005.42"; Normalizer.to_numeric(n).should be_close(@result, 0.001) }
|
9
|
+
specify { n = "1005,42"; Normalizer.to_numeric(n).should be_close(@result, 0.001) }
|
10
|
+
specify { n = "1,005,420"; Normalizer.to_numeric(n).should be_close(1005420.0, 0.001) }
|
11
|
+
|
12
|
+
specify { n = "1 005.42"; Normalizer.to_numeric(n).should be_close(@result, 0.001) }
|
13
|
+
specify { n = "1,005.42"; Normalizer.to_numeric(n).should be_close(@result, 0.001) }
|
14
|
+
specify { n = "1'005.42"; Normalizer.to_numeric(n).should be_close(@result, 0.001) }
|
15
|
+
specify { n = "1 a 005.42"; Normalizer.to_numeric(n).should be_close(@result, 0.001) }
|
16
|
+
specify { n = "$ 1005.42"; Normalizer.to_numeric(n).should be_close(@result, 0.001) }
|
17
|
+
specify { n = "1'005.420,000,251"; Normalizer.to_numeric(n).should be_close(@result, 0.001) }
|
18
|
+
end
|
19
|
+
describe "ints" do
|
20
|
+
before { @result = 1005.0}
|
21
|
+
specify { n = "1005"; Normalizer.to_numeric(n).should be_close(@result, 0.001) }
|
22
|
+
specify { n = "1,005"; lambda { Normalizer.to_numeric(n).should be_close(@result, 0.001) }.should raise_error }
|
23
|
+
specify { n = "1.005"; lambda { Normalizer.to_numeric(n).should be_close(@result, 0.001) }.should raise_error }
|
24
|
+
specify { n = "1'005"; lambda { Normalizer.to_numeric(n).should be_close(@result, 0.001) }.should raise_error }
|
25
|
+
|
26
|
+
specify { Normalizer.to_numeric("1,000", :separator => '.').should be_close(1000.0, 0.01) }
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
|
4
|
+
module ScrapedResource
|
5
|
+
describe Row, '#normalize_value' do
|
6
|
+
describe "one normalizer" do
|
7
|
+
before { @row = Row.new(nil,nil,nil) }
|
8
|
+
specify { @row.normalize_value([:numeric], "145.01").should == 145.01 }
|
9
|
+
end
|
10
|
+
describe "chained normalizers" do
|
11
|
+
before do
|
12
|
+
@row = Row.new(nil,nil,nil)
|
13
|
+
@row.should_receive(:to_foo).and_return(10)
|
14
|
+
end
|
15
|
+
specify { @row.normalize_value([:numeric, :foo], "145.01").should == 10 }
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'init'
|
2
|
+
#require 'lib/scraped_resource'
|
3
|
+
|
4
|
+
|
5
|
+
|
6
|
+
# Defines one example group per results fixture matched by +path+.
#
# For every file returned by Dir[path], the source file is the same path with
# its trailing ".yml" removed. Fixtures whose source file does not exist are
# skipped. For the others, the source is parsed with
# klass.list_mapper.new(source_file).to_a and each row of the YAML fixture is
# compared with the parsed row at the same index.
#
# path  - glob pattern (String) selecting the YAML result fixtures.
# klass - object responding to +list_mapper+; the returned class is
#         instantiated with the source file path.
#
# Returns the Array of fixture paths produced by Dir[path].
def check_results(path, klass)
  Dir[path].each do |results_file|
    # Strip only the trailing extension. A plain gsub('.yml', '') would also
    # mangle directory names that happen to contain ".yml".
    source_file = results_file.sub(/\.yml\z/, '')
    # File.exists? is deprecated and was removed in Ruby 3.2.
    next unless File.exist?(source_file)

    describe results_file do
      before(:all) do
        @spider = klass.list_mapper.new(source_file)
        @arr = @spider.to_a
      end

      specify { @arr.should_not be_nil }

      # Best effort: an unreadable or empty fixture yields no row examples
      # instead of aborting the whole spec run. An empty YAML document loads
      # as a non-array value, hence the `|| []`.
      results = begin
        YAML.load_file(results_file) || []
      rescue StandardError
        []
      end
      puts path if results.empty?

      results.each_with_index do |result, i|
        it "#{results_file}row #{i} should be the same" do
          result.each do |key, expected_value|
            # The :spidered_at key is excluded from the comparison.
            next if key == :spidered_at

            parsed_value = @arr[i][key]
            if parsed_value.is_a?(Float)
              # Floats are compared with a tolerance rather than for equality.
              parsed_value.should be_close(expected_value, 0.0001)
            else
              # Everything else is compared by its string representation.
              parsed_value.to_s.should ==(expected_value.to_s)
            end
          end
        end
      end
    end
  end
end
|
38
|
+
|
metadata
ADDED
@@ -0,0 +1,220 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: scraped_resource
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 29
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 0.0.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- hasclass
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-10-01 00:00:00 +02:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: activesupport
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
version: "0"
|
33
|
+
type: :runtime
|
34
|
+
version_requirements: *id001
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: fastercsv
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - "="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
hash: 5
|
44
|
+
segments:
|
45
|
+
- 1
|
46
|
+
- 5
|
47
|
+
- 3
|
48
|
+
version: 1.5.3
|
49
|
+
type: :runtime
|
50
|
+
version_requirements: *id002
|
51
|
+
- !ruby/object:Gem::Dependency
|
52
|
+
name: mechanize
|
53
|
+
prerelease: false
|
54
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
55
|
+
none: false
|
56
|
+
requirements:
|
57
|
+
- - "="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
hash: 23
|
60
|
+
segments:
|
61
|
+
- 1
|
62
|
+
- 0
|
63
|
+
- 0
|
64
|
+
version: 1.0.0
|
65
|
+
type: :runtime
|
66
|
+
version_requirements: *id003
|
67
|
+
- !ruby/object:Gem::Dependency
|
68
|
+
name: roo
|
69
|
+
prerelease: false
|
70
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
71
|
+
none: false
|
72
|
+
requirements:
|
73
|
+
- - "="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
hash: 53
|
76
|
+
segments:
|
77
|
+
- 1
|
78
|
+
- 9
|
79
|
+
- 3
|
80
|
+
version: 1.9.3
|
81
|
+
type: :runtime
|
82
|
+
version_requirements: *id004
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: google-spreadsheet-ruby
|
85
|
+
prerelease: false
|
86
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
87
|
+
none: false
|
88
|
+
requirements:
|
89
|
+
- - ">="
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
hash: 3
|
92
|
+
segments:
|
93
|
+
- 0
|
94
|
+
version: "0"
|
95
|
+
type: :runtime
|
96
|
+
version_requirements: *id005
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rubyzip
|
99
|
+
prerelease: false
|
100
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
101
|
+
none: false
|
102
|
+
requirements:
|
103
|
+
- - "="
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
hash: 51
|
106
|
+
segments:
|
107
|
+
- 0
|
108
|
+
- 9
|
109
|
+
- 4
|
110
|
+
version: 0.9.4
|
111
|
+
type: :runtime
|
112
|
+
version_requirements: *id006
|
113
|
+
- !ruby/object:Gem::Dependency
|
114
|
+
name: spreadsheet
|
115
|
+
prerelease: false
|
116
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
117
|
+
none: false
|
118
|
+
requirements:
|
119
|
+
- - "="
|
120
|
+
- !ruby/object:Gem::Version
|
121
|
+
hash: 109
|
122
|
+
segments:
|
123
|
+
- 0
|
124
|
+
- 6
|
125
|
+
- 4
|
126
|
+
- 1
|
127
|
+
version: 0.6.4.1
|
128
|
+
type: :runtime
|
129
|
+
version_requirements: *id007
|
130
|
+
- !ruby/object:Gem::Dependency
|
131
|
+
name: rspec
|
132
|
+
prerelease: false
|
133
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
134
|
+
none: false
|
135
|
+
requirements:
|
136
|
+
- - "="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
hash: 27
|
139
|
+
segments:
|
140
|
+
- 1
|
141
|
+
- 3
|
142
|
+
- 0
|
143
|
+
version: 1.3.0
|
144
|
+
type: :development
|
145
|
+
version_requirements: *id008
|
146
|
+
description: Webscraping framework
|
147
|
+
email: sebi.burkhard@gmail.com
|
148
|
+
executables: []
|
149
|
+
|
150
|
+
extensions: []
|
151
|
+
|
152
|
+
extra_rdoc_files:
|
153
|
+
- LICENSE
|
154
|
+
- README.rdoc
|
155
|
+
files:
|
156
|
+
- .gitignore
|
157
|
+
- LICENSE
|
158
|
+
- README.rdoc
|
159
|
+
- Rakefile
|
160
|
+
- VERSION
|
161
|
+
- scraped_resource.gemspec
|
162
|
+
- spec/models/scraped_resource/attribute_spec.rb
|
163
|
+
- spec/models/scraped_resource/base_spec.rb
|
164
|
+
- spec/models/scraped_resource/csv/list_spec.rb
|
165
|
+
- spec/models/scraped_resource/excel/base_spec.rb
|
166
|
+
- spec/models/scraped_resource/excel/list_spec.rb
|
167
|
+
- spec/models/scraped_resource/excel/show_spec.rb
|
168
|
+
- spec/models/scraped_resource/html/list_spec.rb
|
169
|
+
- spec/models/scraped_resource/normalizer/base_spec.rb
|
170
|
+
- spec/models/scraped_resource/normalizer/numeric_spec.rb
|
171
|
+
- spec/models/scraped_resource/row_spec.rb
|
172
|
+
- spec/models/scraped_resource/util_spec.rb
|
173
|
+
- spec/spec_helper.rb
|
174
|
+
has_rdoc: true
|
175
|
+
homepage: http://github.com/hasclass/scraped_resource
|
176
|
+
licenses: []
|
177
|
+
|
178
|
+
post_install_message:
|
179
|
+
rdoc_options:
|
180
|
+
- --charset=UTF-8
|
181
|
+
require_paths:
|
182
|
+
- lib
|
183
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
184
|
+
none: false
|
185
|
+
requirements:
|
186
|
+
- - ">="
|
187
|
+
- !ruby/object:Gem::Version
|
188
|
+
hash: 3
|
189
|
+
segments:
|
190
|
+
- 0
|
191
|
+
version: "0"
|
192
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
193
|
+
none: false
|
194
|
+
requirements:
|
195
|
+
- - ">="
|
196
|
+
- !ruby/object:Gem::Version
|
197
|
+
hash: 3
|
198
|
+
segments:
|
199
|
+
- 0
|
200
|
+
version: "0"
|
201
|
+
requirements: []
|
202
|
+
|
203
|
+
rubyforge_project:
|
204
|
+
rubygems_version: 1.3.7
|
205
|
+
signing_key:
|
206
|
+
specification_version: 3
|
207
|
+
summary: Webscraping framework
|
208
|
+
test_files:
|
209
|
+
- spec/models/scraped_resource/attribute_spec.rb
|
210
|
+
- spec/models/scraped_resource/base_spec.rb
|
211
|
+
- spec/models/scraped_resource/csv/list_spec.rb
|
212
|
+
- spec/models/scraped_resource/excel/base_spec.rb
|
213
|
+
- spec/models/scraped_resource/excel/list_spec.rb
|
214
|
+
- spec/models/scraped_resource/excel/show_spec.rb
|
215
|
+
- spec/models/scraped_resource/html/list_spec.rb
|
216
|
+
- spec/models/scraped_resource/normalizer/base_spec.rb
|
217
|
+
- spec/models/scraped_resource/normalizer/numeric_spec.rb
|
218
|
+
- spec/models/scraped_resource/row_spec.rb
|
219
|
+
- spec/models/scraped_resource/util_spec.rb
|
220
|
+
- spec/spec_helper.rb
|