scraped_resource 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.rdoc +3 -0
- data/Rakefile +54 -0
- data/VERSION +1 -0
- data/scraped_resource.gemspec +81 -0
- data/spec/models/scraped_resource/attribute_spec.rb +28 -0
- data/spec/models/scraped_resource/base_spec.rb +22 -0
- data/spec/models/scraped_resource/csv/list_spec.rb +64 -0
- data/spec/models/scraped_resource/excel/base_spec.rb +9 -0
- data/spec/models/scraped_resource/excel/list_spec.rb +98 -0
- data/spec/models/scraped_resource/excel/show_spec.rb +0 -0
- data/spec/models/scraped_resource/html/list_spec.rb +67 -0
- data/spec/models/scraped_resource/normalizer/base_spec.rb +8 -0
- data/spec/models/scraped_resource/normalizer/numeric_spec.rb +28 -0
- data/spec/models/scraped_resource/row_spec.rb +18 -0
- data/spec/models/scraped_resource/util_spec.rb +7 -0
- data/spec/spec_helper.rb +38 -0
- metadata +220 -0
data/.gitignore
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2010 hasclass
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
# Packaging tasks. Jeweler is optional: if it (or one of its dependencies)
# cannot be loaded, a hint is printed and the rest of the Rakefile is still read.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |gemspec|
    gemspec.name        = "scraped_resource"
    gemspec.summary     = "Webscraping framework"
    gemspec.description = "Webscraping framework"
    gemspec.email       = "sebi.burkhard@gmail.com"
    gemspec.homepage    = "http://github.com/hasclass/scraped_resource"
    gemspec.authors     = ["hasclass"]

    # Runtime dependencies, in declaration order. Entries without a second
    # element are added without a version constraint.
    [
      ['activesupport'],
      ['fastercsv', '1.5.3'],
      ['mechanize', '1.0.0'],
      ['roo', '1.9.3'],
      ['google-spreadsheet-ruby'],
      ['rubyzip', '0.9.4'],
      ['spreadsheet', '0.6.4.1']
    ].each { |name_and_version| gemspec.add_dependency(*name_and_version) }

    gemspec.add_development_dependency('rspec', '1.3.0')
    # The yielded object is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end

  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end
|
29
|
+
|
30
|
+
require 'spec/rake/spectask'
|
31
|
+
# `rake spec`: runs every *_spec.rb below spec/, with 'lib' and 'spec' added
# to the task's libs.
Spec::Rake::SpecTask.new(:spec) do |t|
  t.libs.push('lib', 'spec')
  t.spec_files = FileList['spec/**/*_spec.rb']
end

# `rake rcov`: same selection of specs (given as a glob pattern) with rcov
# switched on.
Spec::Rake::SpecTask.new(:rcov) do |t|
  t.libs.push('lib', 'spec')
  t.pattern = 'spec/**/*_spec.rb'
  t.rcov    = true
end

# :check_dependencies is a prerequisite of the specs; the specs are the
# default task.
task :spec    => :check_dependencies

task :default => :spec
|
45
|
+
|
46
|
+
require 'rake/rdoctask'
|
47
|
+
# `rake rdoc`: builds the API documentation into rdoc/ from the README and
# every Ruby file below lib/.
Rake::RDocTask.new do |rdoc|
  # Strip the file content so a trailing newline in VERSION does not end up
  # inside the title. Without a VERSION file the title carries no version.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  # Use the gem's actual name (see gem.name in the Jeweler task).
  rdoc.title = "scraped_resource #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.1
|
@@ -0,0 +1,81 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
# Gem specification for scraped_resource 0.0.1. This is jeweler output:
# change Jeweler::Tasks in the Rakefile and regenerate rather than editing
# the values here by hand.
Gem::Specification.new do |s|
  s.name = %q{scraped_resource}
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["hasclass"]
  s.date = %q{2010-10-01}
  s.description = %q{Webscraping framework}
  s.email = %q{sebi.burkhard@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "scraped_resource.gemspec"
  ]
  s.homepage = %q{http://github.com/hasclass/scraped_resource}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.7}
  s.summary = %q{Webscraping framework}
  s.test_files = [
    "spec/models/scraped_resource/attribute_spec.rb",
    "spec/models/scraped_resource/base_spec.rb",
    "spec/models/scraped_resource/csv/list_spec.rb",
    "spec/models/scraped_resource/excel/base_spec.rb",
    "spec/models/scraped_resource/excel/list_spec.rb",
    "spec/models/scraped_resource/excel/show_spec.rb",
    "spec/models/scraped_resource/html/list_spec.rb",
    "spec/models/scraped_resource/normalizer/base_spec.rb",
    "spec/models/scraped_resource/normalizer/numeric_spec.rb",
    "spec/models/scraped_resource/row_spec.rb",
    "spec/models/scraped_resource/util_spec.rb",
    "spec/spec_helper.rb"
  ]

  # The same dependency list is declared three times so the file loads on
  # RubyGems versions with and without runtime/development dependency types.
  if s.respond_to? :specification_version then
    s.specification_version = 3

    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      s.add_runtime_dependency(%q<activesupport>, [">= 0"])
      s.add_runtime_dependency(%q<fastercsv>, ["= 1.5.3"])
      s.add_runtime_dependency(%q<mechanize>, ["= 1.0.0"])
      s.add_runtime_dependency(%q<roo>, ["= 1.9.3"])
      s.add_runtime_dependency(%q<google-spreadsheet-ruby>, [">= 0"])
      s.add_runtime_dependency(%q<rubyzip>, ["= 0.9.4"])
      s.add_runtime_dependency(%q<spreadsheet>, ["= 0.6.4.1"])
      s.add_development_dependency(%q<rspec>, ["= 1.3.0"])
    else
      s.add_dependency(%q<activesupport>, [">= 0"])
      s.add_dependency(%q<fastercsv>, ["= 1.5.3"])
      s.add_dependency(%q<mechanize>, ["= 1.0.0"])
      s.add_dependency(%q<roo>, ["= 1.9.3"])
      s.add_dependency(%q<google-spreadsheet-ruby>, [">= 0"])
      s.add_dependency(%q<rubyzip>, ["= 0.9.4"])
      s.add_dependency(%q<spreadsheet>, ["= 0.6.4.1"])
      s.add_dependency(%q<rspec>, ["= 1.3.0"])
    end
  else
    s.add_dependency(%q<activesupport>, [">= 0"])
    s.add_dependency(%q<fastercsv>, ["= 1.5.3"])
    s.add_dependency(%q<mechanize>, ["= 1.0.0"])
    s.add_dependency(%q<roo>, ["= 1.9.3"])
    s.add_dependency(%q<google-spreadsheet-ruby>, [">= 0"])
    s.add_dependency(%q<rubyzip>, ["= 0.9.4"])
    s.add_dependency(%q<spreadsheet>, ["= 0.6.4.1"])
    s.add_dependency(%q<rspec>, ["= 1.3.0"])
  end
end
|
81
|
+
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
|
4
|
+
module ScrapedResource

  # Specs for Attribute#value when :format names one or several formatters.
  describe Attribute do
    before(:each) do
      @attr = Attribute.new('100')
    end

    describe "one formatter" do
      before(:each) do
        @attr = Attribute.new('100', :format => :to_float)
        # The single formatter must be called with the raw value.
        @attr.should_receive(:to_float).with('100').and_return(100)
      end

      it { @attr.value.should == 100 }
    end

    describe "format chain" do
      before(:each) do
        @attr = Attribute.new('100', :format => [:chain_one, :chain_two])
        # Each formatter runs once; the second receives the first one's result.
        @attr.should_receive(:chain_one).once.with('100').and_return(90)
        @attr.should_receive(:chain_two).once.with(90).and_return(80)
      end

      it { @attr.value.should == 80 }
    end
  end

end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module ScrapedResource

  # Specs for the class-level Base.options hash.
  describe Base do

    describe "#options" do
      subject { Base.options }

      specify { subject[:base_path].should == 'tmp/scraped_resources' }
    end

    describe "#options[:base_path] = something_else" do
      before do
        # Remember the current value: Base.options is written to below and
        # read back through the class, so the change would otherwise leak into
        # every example that runs afterwards.
        @original_path = Base.options[:base_path]
        @new_path = 'new_path'
        Base.options[:base_path] = @new_path
      end

      after do
        Base.options[:base_path] = @original_path
      end

      specify { Base.options[:base_path].should == @new_path }
    end

  end

end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
|
4
|
+
module ScrapedResource::Csv

  # Sample1 reads spec/files/csv/sample.csv with `config.headers 1`
  # (presumably the position of the header row; confirm against Csv::List).
  describe List do
    class Sample1 < ScrapedResource::Csv::List
      csv_list { |config| config.headers 1 }

      attribute :name, :key => :name
    end

    before(:each) do
      @list = Sample1.new('spec/files/csv/sample.csv')
    end

    subject { @list }

    its(:document) { should_not be_nil }

    describe :headers do
      subject { @list.headers }

      it { should_not be_empty }
      it { should include(:name) }
    end

    describe :results do
      before(:each) do
        @results = @list.to_a
      end

      subject { @results }

      it { should have(2).items }
      it { @results.any? { |row| row[:name] == 'Foo' }.should be_true }
      it { @results.any? { |row| row[:name] == 'Hello' }.should be_true }
    end
  end

  # Sample2 reads spec/files/csv/sample2.csv with `config.headers 3`; the
  # expected headers and rows are the same as for Sample1.
  describe List do
    class Sample2 < ScrapedResource::Csv::List
      csv_list { |config| config.headers 3 }

      attribute :name, :key => :name
    end

    before(:each) do
      @list = Sample2.new('spec/files/csv/sample2.csv')
    end

    describe :headers do
      subject { @list.headers }

      it { should_not be_empty }
      it { should include(:name) }
    end

    describe :results do
      before(:each) do
        @results = @list.to_a
      end

      subject { @results }

      it { should have(2).items }
      it { @results.any? { |row| row[:name] == 'Foo' }.should be_true }
      it { @results.any? { |row| row[:name] == 'Hello' }.should be_true }
    end
  end

end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
|
4
|
+
module ScrapedResource::Excel

  # Attribute declarations in all supported forms (:key, :method, a block,
  # and :key combined with a block), read from spec/files/excel/sample1.xls.
  describe List do
    class Sample1 < ScrapedResource::Excel::List
      excel_list do |config|
        config.headers 5
      end

      attribute :name, :key => :name
      attribute :custom, :method => :custom_method
      attribute :list_of_names, :key => [:sym, :symbol]

      # Block without :key: the value is built from the row.
      attribute :name_plus_foo do |row, _value|
        "#{row[:name]}_plus"
      end

      # Block with :key: the value is built from the second block argument.
      attribute :block_with_column, :key => :name do |_row, value|
        "#{value}_plus"
      end

      def custom_method
        'custom'
      end
    end

    before { @list = Sample1.new('spec/files/excel/sample1.xls') }

    subject do
      @list
    end

    its(:document) { should_not be_nil }

    describe :headers do
      subject { @list.headers }

      it { should_not be_empty }
      it { should include(:symbol) }
      it { should include(:indexanbieter) }
    end

    describe :results do
      before { @results = @list.to_a }
      subject { @results }

      it { should have(4).items }
      specify { @results.any? { |r| r[:name] == 'Foo' }.should be_true }
      specify { @results.first.has_key?(:custom).should be_true }
      specify { @results.first[:list_of_names].should == '100GBA' }
      specify { @results.first[:custom].should == 'custom' }
      specify { @results.first[:name].should == 'Foo' }
      specify { @results.first[:name_plus_foo].should == 'Foo_plus' }
      specify { @results.first[:block_with_column].should == 'Foo_plus' }
    end
  end

  # Two reject_row_if rules are declared; the same workbook that yields 4 rows
  # above is expected to yield 2 here.
  describe List, "#reject_row_if" do
    class SampleRejectRow < ScrapedResource::Excel::List
      excel_list do |config|
        config.headers 5
      end

      attribute :name, :key => :name

      reject_row_if { |row| row[:name].blank? }
      reject_row_if { |row| row[:name] == 'Baz' }
    end

    subject do
      SampleRejectRow.new('spec/files/excel/sample1.xls')
    end

    its(:to_a) { should have(2).items }
  end

  # Headers supplied as a block returning a Hash instead of a number.
  describe List, "#header" do
    class SampleHeaderWithBlock < ScrapedResource::Excel::List

      excel_list do |config|
        config.headers do |_list|
          {:name => 1}
        end
      end

      attribute :name, :key => :name
    end

    subject do
      SampleHeaderWithBlock.new('spec/files/excel/sample1.xls')
    end

    its(:to_a) { should have(9).items }
  end
end
|
File without changes
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
|
4
|
+
module ScrapedResource::Html
  # Specs for an HTML table scraper configured with CSS selectors and a
  # header block, run against spec/files/html/sample.html.
  describe List do
    class Xetra < ScrapedResource::Html::List
      html_list do |config|
        config.table 'table.fulldouble'
        config.rows 'tr.sorter ~ tr'
        config.cells 'td'
        config.cell_values :inner_html

        # Maps the slug of every <th> text in the first matched row to its
        # column index.
        config.headers do |base|
          first_row    = (base.document / "table.fulldouble:first-of-type tr")[0]
          header_cells = first_row / "th"

          positions = {}
          header_cells.each_with_index do |cell, index|
            label = cell.inner_text || 'undefined'
            positions[ScrapedResource::Utilities.slug(label).to_sym] = index
          end
          positions
        end
      end

      # Keeps the part of the cell's inner HTML after the last '<br>'.
      attribute :isin, :key => :nameisin do |row, value|
        value.split('<br>').last
      end
      attribute :default, :default => 'foo'
      attribute :custom, :method => :custom_method
      attribute :list_of_names, :key => [:sym, :symbol]
      attribute :name_plus_foo do |row, value|
        "#{row[:isin]}_plus"
      end
      attribute :block_with_column, :key => :name do |row, value|
        "#{value}_plus"
      end

      def custom_method
        'custom'
      end
    end

    before(:each) do
      @list = Xetra.new('spec/files/html/sample.html')
    end

    subject { @list }

    its(:document) { should_not be_nil }

    describe :headers do
      subject { @list.headers }

      it { should_not be_empty }
      it { should include(:nameisin) }
      it { @list.headers[:nameisin].should == 0 }
    end

    describe :results do
      before(:each) do
        @results = @list.to_a
      end

      subject { @results }

      it { should have(20).items }
      # specify { @results.any?{|r| r[:name] == 'Foo'}.should be_true }
      # specify { @results.first.has_key?(:custom).should be_true }
      # specify { @results.first[:list_of_names].should == '100GBA' }
      # specify { @results.first[:custom].should == 'custom' }
      it { @results.first[:isin].should == 'FR0010821728' }
      it { @results.first[:default].should == 'foo' }
      # specify { @results.first[:name_plus_foo].should == 'Foo_plus' }
      # specify { @results.first[:block_with_column].should == 'Foo_plus' }
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
|
4
|
+
module ScrapedResource
  # Normalizer.to_numeric with decimal inputs in assorted notations.
  describe "floats" do
    before(:each) { @result = 1005.42 }

    ["1005.42", "1005,42"].each do |raw|
      it { Normalizer.to_numeric(raw).should be_close(@result, 0.001) }
    end

    it { Normalizer.to_numeric("1,005,420").should be_close(1005420.0, 0.001) }

    [
      "1 005.42",
      "1,005.42",
      "1'005.42",
      "1 a 005.42",
      "$ 1005.42",
      "1'005.420,000,251"
    ].each do |raw|
      it { Normalizer.to_numeric(raw).should be_close(@result, 0.001) }
    end
  end

  # Normalizer.to_numeric with whole numbers.
  describe "ints" do
    before(:each) { @result = 1005.0 }

    it { Normalizer.to_numeric("1005").should be_close(@result, 0.001) }

    # For these inputs the lambda must raise: either to_numeric itself raises
    # or the be_close expectation inside the lambda fails.
    ["1,005", "1.005", "1'005"].each do |raw|
      it do
        lambda { Normalizer.to_numeric(raw).should be_close(@result, 0.001) }.should raise_error
      end
    end

    it { Normalizer.to_numeric("1,000", :separator => '.').should be_close(1000.0, 0.01) }
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
|
4
|
+
module ScrapedResource
  # Row#normalize_value takes a list of normalizer names and a raw value.
  describe Row, '#normalize_value' do
    describe "one normalizer" do
      before(:each) do
        @row = Row.new(nil, nil, nil)
      end

      it { @row.normalize_value([:numeric], "145.01").should == 145.01 }
    end

    describe "chained normalizers" do
      before(:each) do
        @row = Row.new(nil, nil, nil)
        # :foo is expected to be dispatched to #to_foo, whose result is final.
        @row.should_receive(:to_foo).and_return(10)
      end

      it { @row.normalize_value([:numeric, :foo], "145.01").should == 10 }
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'init'
|
2
|
+
#require 'lib/scraped_resource'
|
3
|
+
|
4
|
+
|
5
|
+
|
6
|
+
# Defines one example group per results fixture matched by +path+.
#
# A fixture "<source>.yml" holds the expected rows for the file "<source>";
# fixtures whose source file does not exist are skipped. The source is parsed
# with klass.list_mapper.new(source).to_a and every expected key/value pair is
# compared with the parsed row at the same index.
#
# path  - glob pattern selecting the results fixtures.
# klass - object whose +list_mapper+ returns the class that is instantiated
#         with the source file path.
#
# Returns the Array of paths matched by +path+.
def check_results(path, klass)
  Dir[path].each do |results_file|
    # Remove only the trailing extension; ".yml" elsewhere in the path stays.
    source_file = results_file.sub(/\.yml\z/, '')
    # File.exists? is gone in Ruby 3.2+; File.exist? works on every version.
    next unless File.exist?(source_file)

    describe results_file do
      before(:all) do
        @spider = klass.list_mapper.new(source_file)
        @arr = @spider.to_a
      end

      specify { @arr.should_not be_nil }

      # Best effort: an unreadable or empty fixture produces no row examples
      # instead of aborting the whole spec run.
      results = begin
        YAML.load_file(results_file) || []
      rescue StandardError
        []
      end
      puts path if results.empty?

      results.each_with_index do |result, i|
        it "#{results_file}row #{i} should be the same" do
          result.each do |key, expected_value|
            next if key == :spidered_at

            parsed_value = @arr[i][key]
            if parsed_value.is_a?(Float)
              # Floats are compared with a tolerance.
              parsed_value.should be_close(expected_value, 0.0001)
            else
              # Everything else is compared by its string representation.
              parsed_value.to_s.should ==(expected_value.to_s)
            end
          end
        end
      end
    end # describe
  end
end
|
38
|
+
|
metadata
ADDED
@@ -0,0 +1,220 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: scraped_resource
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 29
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 0.0.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- hasclass
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-10-01 00:00:00 +02:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: activesupport
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
version: "0"
|
33
|
+
type: :runtime
|
34
|
+
version_requirements: *id001
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: fastercsv
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - "="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
hash: 5
|
44
|
+
segments:
|
45
|
+
- 1
|
46
|
+
- 5
|
47
|
+
- 3
|
48
|
+
version: 1.5.3
|
49
|
+
type: :runtime
|
50
|
+
version_requirements: *id002
|
51
|
+
- !ruby/object:Gem::Dependency
|
52
|
+
name: mechanize
|
53
|
+
prerelease: false
|
54
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
55
|
+
none: false
|
56
|
+
requirements:
|
57
|
+
- - "="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
hash: 23
|
60
|
+
segments:
|
61
|
+
- 1
|
62
|
+
- 0
|
63
|
+
- 0
|
64
|
+
version: 1.0.0
|
65
|
+
type: :runtime
|
66
|
+
version_requirements: *id003
|
67
|
+
- !ruby/object:Gem::Dependency
|
68
|
+
name: roo
|
69
|
+
prerelease: false
|
70
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
71
|
+
none: false
|
72
|
+
requirements:
|
73
|
+
- - "="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
hash: 53
|
76
|
+
segments:
|
77
|
+
- 1
|
78
|
+
- 9
|
79
|
+
- 3
|
80
|
+
version: 1.9.3
|
81
|
+
type: :runtime
|
82
|
+
version_requirements: *id004
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: google-spreadsheet-ruby
|
85
|
+
prerelease: false
|
86
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
87
|
+
none: false
|
88
|
+
requirements:
|
89
|
+
- - ">="
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
hash: 3
|
92
|
+
segments:
|
93
|
+
- 0
|
94
|
+
version: "0"
|
95
|
+
type: :runtime
|
96
|
+
version_requirements: *id005
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rubyzip
|
99
|
+
prerelease: false
|
100
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
101
|
+
none: false
|
102
|
+
requirements:
|
103
|
+
- - "="
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
hash: 51
|
106
|
+
segments:
|
107
|
+
- 0
|
108
|
+
- 9
|
109
|
+
- 4
|
110
|
+
version: 0.9.4
|
111
|
+
type: :runtime
|
112
|
+
version_requirements: *id006
|
113
|
+
- !ruby/object:Gem::Dependency
|
114
|
+
name: spreadsheet
|
115
|
+
prerelease: false
|
116
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
117
|
+
none: false
|
118
|
+
requirements:
|
119
|
+
- - "="
|
120
|
+
- !ruby/object:Gem::Version
|
121
|
+
hash: 109
|
122
|
+
segments:
|
123
|
+
- 0
|
124
|
+
- 6
|
125
|
+
- 4
|
126
|
+
- 1
|
127
|
+
version: 0.6.4.1
|
128
|
+
type: :runtime
|
129
|
+
version_requirements: *id007
|
130
|
+
- !ruby/object:Gem::Dependency
|
131
|
+
name: rspec
|
132
|
+
prerelease: false
|
133
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
134
|
+
none: false
|
135
|
+
requirements:
|
136
|
+
- - "="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
hash: 27
|
139
|
+
segments:
|
140
|
+
- 1
|
141
|
+
- 3
|
142
|
+
- 0
|
143
|
+
version: 1.3.0
|
144
|
+
type: :development
|
145
|
+
version_requirements: *id008
|
146
|
+
description: Webscraping framework
|
147
|
+
email: sebi.burkhard@gmail.com
|
148
|
+
executables: []
|
149
|
+
|
150
|
+
extensions: []
|
151
|
+
|
152
|
+
extra_rdoc_files:
|
153
|
+
- LICENSE
|
154
|
+
- README.rdoc
|
155
|
+
files:
|
156
|
+
- .gitignore
|
157
|
+
- LICENSE
|
158
|
+
- README.rdoc
|
159
|
+
- Rakefile
|
160
|
+
- VERSION
|
161
|
+
- scraped_resource.gemspec
|
162
|
+
- spec/models/scraped_resource/attribute_spec.rb
|
163
|
+
- spec/models/scraped_resource/base_spec.rb
|
164
|
+
- spec/models/scraped_resource/csv/list_spec.rb
|
165
|
+
- spec/models/scraped_resource/excel/base_spec.rb
|
166
|
+
- spec/models/scraped_resource/excel/list_spec.rb
|
167
|
+
- spec/models/scraped_resource/excel/show_spec.rb
|
168
|
+
- spec/models/scraped_resource/html/list_spec.rb
|
169
|
+
- spec/models/scraped_resource/normalizer/base_spec.rb
|
170
|
+
- spec/models/scraped_resource/normalizer/numeric_spec.rb
|
171
|
+
- spec/models/scraped_resource/row_spec.rb
|
172
|
+
- spec/models/scraped_resource/util_spec.rb
|
173
|
+
- spec/spec_helper.rb
|
174
|
+
has_rdoc: true
|
175
|
+
homepage: http://github.com/hasclass/scraped_resource
|
176
|
+
licenses: []
|
177
|
+
|
178
|
+
post_install_message:
|
179
|
+
rdoc_options:
|
180
|
+
- --charset=UTF-8
|
181
|
+
require_paths:
|
182
|
+
- lib
|
183
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
184
|
+
none: false
|
185
|
+
requirements:
|
186
|
+
- - ">="
|
187
|
+
- !ruby/object:Gem::Version
|
188
|
+
hash: 3
|
189
|
+
segments:
|
190
|
+
- 0
|
191
|
+
version: "0"
|
192
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
193
|
+
none: false
|
194
|
+
requirements:
|
195
|
+
- - ">="
|
196
|
+
- !ruby/object:Gem::Version
|
197
|
+
hash: 3
|
198
|
+
segments:
|
199
|
+
- 0
|
200
|
+
version: "0"
|
201
|
+
requirements: []
|
202
|
+
|
203
|
+
rubyforge_project:
|
204
|
+
rubygems_version: 1.3.7
|
205
|
+
signing_key:
|
206
|
+
specification_version: 3
|
207
|
+
summary: Webscraping framework
|
208
|
+
test_files:
|
209
|
+
- spec/models/scraped_resource/attribute_spec.rb
|
210
|
+
- spec/models/scraped_resource/base_spec.rb
|
211
|
+
- spec/models/scraped_resource/csv/list_spec.rb
|
212
|
+
- spec/models/scraped_resource/excel/base_spec.rb
|
213
|
+
- spec/models/scraped_resource/excel/list_spec.rb
|
214
|
+
- spec/models/scraped_resource/excel/show_spec.rb
|
215
|
+
- spec/models/scraped_resource/html/list_spec.rb
|
216
|
+
- spec/models/scraped_resource/normalizer/base_spec.rb
|
217
|
+
- spec/models/scraped_resource/normalizer/numeric_spec.rb
|
218
|
+
- spec/models/scraped_resource/row_spec.rb
|
219
|
+
- spec/models/scraped_resource/util_spec.rb
|
220
|
+
- spec/spec_helper.rb
|