yasf 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +19 -0
- data/.rvmrc +52 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +30 -0
- data/LICENSE.txt +22 -0
- data/README.md +45 -0
- data/Rakefile +7 -0
- data/lib/yasf/scraper.rb +144 -0
- data/lib/yasf/version.rb +3 -0
- data/lib/yasf.rb +15 -0
- data/spec/fixtures/.gitkeep +0 -0
- data/spec/fixtures/advanced_example_response +41 -0
- data/spec/fixtures/basic_example_response +10 -0
- data/spec/fixtures/medium_example_response +13 -0
- data/spec/fixtures/thepiratebay_response.html +510 -0
- data/spec/lib/yasf/.gitkeep +0 -0
- data/spec/lib/yasf/scraper_spec.rb +18 -0
- data/spec/lib/yasf_spec.rb +100 -0
- data/spec/spec_helper.rb +24 -0
- data/yasf.gemspec +25 -0
- metadata +118 -0
require "spec_helper"

# Specs for the low-level Yasf::Scraper module, exercising the `.result`
# DSL method directly (the higher-level behavior is covered in yasf_spec.rb).
describe Yasf::Scraper do

  describe ".result" do
    it "raises without arguments" do
      lambda {
        Yasf::Scraper::result
      }.should raise_error(ArgumentError, "one symbol to return the value of this accessor")
    end

    it "returns a Proc with two or more symbols" do
      # With several symbols, `result` builds a Proc that aggregates the
      # named accessors rather than returning a single value.
      Yasf::Scraper::result(:one, :two, :three).should be_a(Proc)
    end

  end
end
require 'spec_helper'

# End-to-end specs for the Yasf scraping DSL. All HTTP traffic is stubbed by
# FakeWeb in spec_helper.rb, so the fake URLs below resolve to local fixtures.
describe Yasf do
  it "should be created new scraper without block" do
    scraper = Yasf::define
    scraper.should_not be_nil
    scraper.ancestors.should be_include(Yasf::Scraper)
  end

  it "should raise a syntax error with a wrong selector" do
    url = "http://www.fakeurl.com/basic_example"

    # "2h1" is not a valid CSS element name, so Nokogiri rejects the selector.
    scraper = Yasf::define do
      scrape :title, "2h1.title", :title => :text
      result :title
    end

    lambda {
      title = scraper.extract_from(url)
    }.should raise_error(Nokogiri::CSS::SyntaxError)

  end

  it "scrape basic example content" do
    url = "http://www.fakeurl.com/basic_example"

    scraper = Yasf::define do
      scrape :title, "h1.title", :title => :text
      result :title
    end

    title = scraper.extract_from(url)
    title.should eq("Title 1")
  end

  it "scrape basic example content with not found selector should return nil" do
    url = "http://www.fakeurl.com/basic_example"

    scraper = Yasf::define do
      scrape :title, "h1.not_found_element", :title => :text
      result :title
    end

    title = scraper.extract_from(url)
    title.should be_nil
  end

  it "scrape medium example content and result should be stored in array" do
    url = "http://www.fakeurl.com/medium_example"
    scraper = Yasf::define do
      # The `[]` suffix on the accessor name collects every match into an Array.
      scrape :title, "h1.title", :'titles[]' => :text
      result :titles
    end
    titles = scraper.extract_from(url)
    titles.should be_a(Array)
  end

  it "scrape advanced example content and should have more results" do
    url = "http://www.fakeurl.com/advanced_example"

    # Nested scraper: `title` is applied to each row matched by the outer scraper.
    title = Yasf::define do
      scrape :title, "h1.title_under_table", :title => :text
      scrape :links, "a.title_under_table", :link_name => :text, :link_url => :href

      result :title, :link_name, :link_url
    end
    scraper = Yasf::define do
      scrape :titles, "table tr.tr_with_title", :'titles[]' => title

      result :titles
    end

    titles = scraper.extract_from(url)
    titles.should be_a(Array)
    titles.size.should eq(5)
    titles[3].title.should eq("Title 4")
  end

  it "scrape thepiratebay.se should have more results" do
    url = "http://thepiratebay.se/browse/101"

    album = Yasf::define do
      scrape :name, "td div.detName a", :name => :text
      scrape :desc, "td font.detDesc", :desc => :text

      result :name, :desc
    end

    scraper = Yasf::define do
      scrape :albums, "table#searchResult tbody tr", :'albums[]' => album

      result :albums
    end
    albums = scraper.extract_from(url)
    albums.should be_a(Array)
    albums.size.should eq(31)
    albums[29].name.should eq("Maurice Ravel Complete Piano Works 2CD")
  end

end
require 'yasf'

# Reads a canned HTTP response from spec/fixtures, resolved relative to this file.
def read_fixture(path)
  File.read(File.expand_path(File.join(File.dirname(__FILE__), "fixtures", path)))
end

# URLs exercised by the specs, mapped to the fixture file that should be
# served for each. Frozen so no spec can mutate the shared table.
FAKE_URLS = {
  "http://www.fakeurl.com/basic_example" => "basic_example_response",
  "http://www.fakeurl.com/medium_example" => "medium_example_response",
  "http://www.fakeurl.com/advanced_example" => "advanced_example_response",
  "http://thepiratebay.se/browse/101" => "thepiratebay_response.html"
}.freeze

begin
  require 'fakeweb'

  # Block all real network access so the suite is deterministic and offline,
  # then register each fake URL with its fixture body.
  FakeWeb.allow_net_connect = false
  FAKE_URLS.each do |url, response|
    FakeWeb.register_uri(:get, url, :body => read_fixture(response))
  end
rescue LoadError
  # Diagnostics belong on stderr, not stdout (was `puts`). The suite still
  # loads so unrelated specs can run, but HTTP-backed ones will fail.
  warn "Could not load FakeWeb. Please run 'bundle install'"
end
# -*- encoding: utf-8 -*-
# Gem specification for yasf. File lists are derived from git, so the gem
# must be built from a checkout with git available on PATH.
$:.push File.expand_path("../lib", __FILE__)
require "yasf/version"

Gem::Specification.new do |spec|
  spec.name = "yasf"
  spec.version = Yasf::VERSION
  spec.authors = ["Algonauti"]
  spec.email = ["devel@algonauti.com"]
  spec.homepage = "https://github.com/algonauti/yasf"
  spec.summary = "Uses DSL to write easy, maintainable HTML scraping rules."
  spec.description = "HTML scraping to write maintainable rules to extract data from HTML content."

  # Package everything git tracks; specs become test_files, bin/* executables.
  spec.files = `git ls-files`.split("\n")
  spec.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Runtime dependency is pinned to this exact nokogiri release.
  spec.add_dependency('nokogiri', '1.5.5')

  spec.add_development_dependency 'rake'
  spec.add_development_dependency 'rspec'
  spec.add_development_dependency 'fakeweb'

end
metadata
ADDED
@@ -0,0 +1,118 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: yasf
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.3
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Algonauti
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-09-28 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: &2151806380 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - =
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 1.5.5
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *2151806380
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: rake
|
27
|
+
requirement: &2151805860 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *2151805860
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: rspec
|
38
|
+
requirement: &2151805300 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
type: :development
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *2151805300
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: fakeweb
|
49
|
+
requirement: &2151804720 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
type: :development
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *2151804720
|
58
|
+
description: HTML scraping to write maintainable rules to extract data from HTML content.
|
59
|
+
email:
|
60
|
+
- devel@algonauti.com
|
61
|
+
executables: []
|
62
|
+
extensions: []
|
63
|
+
extra_rdoc_files: []
|
64
|
+
files:
|
65
|
+
- .gitignore
|
66
|
+
- .rvmrc
|
67
|
+
- Gemfile
|
68
|
+
- Gemfile.lock
|
69
|
+
- LICENSE.txt
|
70
|
+
- README.md
|
71
|
+
- Rakefile
|
72
|
+
- lib/yasf.rb
|
73
|
+
- lib/yasf/scraper.rb
|
74
|
+
- lib/yasf/version.rb
|
75
|
+
- spec/fixtures/.gitkeep
|
76
|
+
- spec/fixtures/advanced_example_response
|
77
|
+
- spec/fixtures/basic_example_response
|
78
|
+
- spec/fixtures/medium_example_response
|
79
|
+
- spec/fixtures/thepiratebay_response.html
|
80
|
+
- spec/lib/yasf/.gitkeep
|
81
|
+
- spec/lib/yasf/scraper_spec.rb
|
82
|
+
- spec/lib/yasf_spec.rb
|
83
|
+
- spec/spec_helper.rb
|
84
|
+
- yasf.gemspec
|
85
|
+
homepage: https://github.com/algonauti/yasf
|
86
|
+
licenses: []
|
87
|
+
post_install_message:
|
88
|
+
rdoc_options: []
|
89
|
+
require_paths:
|
90
|
+
- lib
|
91
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
92
|
+
none: false
|
93
|
+
requirements:
|
94
|
+
- - ! '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
98
|
+
none: false
|
99
|
+
requirements:
|
100
|
+
- - ! '>='
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: '0'
|
103
|
+
requirements: []
|
104
|
+
rubyforge_project:
|
105
|
+
rubygems_version: 1.8.15
|
106
|
+
signing_key:
|
107
|
+
specification_version: 3
|
108
|
+
summary: Uses DSL to write easy, maintainable HTML scraping rules.
|
109
|
+
test_files:
|
110
|
+
- spec/fixtures/.gitkeep
|
111
|
+
- spec/fixtures/advanced_example_response
|
112
|
+
- spec/fixtures/basic_example_response
|
113
|
+
- spec/fixtures/medium_example_response
|
114
|
+
- spec/fixtures/thepiratebay_response.html
|
115
|
+
- spec/lib/yasf/.gitkeep
|
116
|
+
- spec/lib/yasf/scraper_spec.rb
|
117
|
+
- spec/lib/yasf_spec.rb
|
118
|
+
- spec/spec_helper.rb
|