scrapify 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/Gemfile +10 -0
- data/README.md +36 -0
- data/Rakefile +6 -0
- data/lib/meta_define.rb +5 -0
- data/lib/scrapify/base.rb +76 -0
- data/lib/scrapify/version.rb +3 -0
- data/lib/scrapify.rb +7 -0
- data/scrapify.gemspec +26 -0
- data/spec/pizza.rb +9 -0
- data/spec/scrapify_spec.rb +79 -0
- data/spec/spec_helper.rb +10 -0
- data/spec/test_models.rb +1 -0
- metadata +95 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
## ScrApify
|
2
|
+
|
3
|
+
ScrApify is a library for building APIs by scraping static sites, with an ActiveRecord-like querying interface.
|
4
|
+
|
5
|
+
### Installation
|
6
|
+
|
7
|
+
```
|
8
|
+
$ gem install scrapify
|
9
|
+
```
|
10
|
+
|
11
|
+
### Usage
|
12
|
+
|
13
|
+
Define the HTML page URL and declare attributes using XPath or CSS selectors.
|
14
|
+
Scrapify classes must have a key attribute defined.
|
15
|
+
|
16
|
+
```
|
17
|
+
class Pizza
|
18
|
+
include Scrapify::Base
|
19
|
+
html "http://www.dominos.co.in/menuDetails_ajx.php?catgId=1"
|
20
|
+
|
21
|
+
attribute :name, css: ".menu_lft li a"
|
22
|
+
attribute :image_url, xpath: "//li//input//@value"
|
23
|
+
|
24
|
+
key :name
|
25
|
+
end
|
26
|
+
```
|
27
|
+
|
28
|
+
Now you can use finder methods to extract data from a static site
|
29
|
+
|
30
|
+
```
|
31
|
+
> Pizza.all
|
32
|
+
|
33
|
+
> pizza = Pizza.find('mushroom')
|
34
|
+
> pizza.name
|
35
|
+
> pizza.image_url
|
36
|
+
```
|
data/Rakefile
ADDED
data/lib/meta_define.rb
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
module Scrapify
  # Mixin that gives a class a small scraping DSL (+html+, +attribute+, +key+)
  # and finder methods built on top of it.
  #
  # Including the module extends the class with ClassMethods and declares the
  # class-level accessors +url+, +doc+ and +attribute_names+ through
  # +cattr_accessor+.
  # NOTE(review): +cattr_accessor+ and +meta_define+ are not defined in this
  # file; +meta_define+ presumably defines a singleton method on the receiver
  # (see lib/meta_define.rb) - confirm.
  module Base
    def self.included(klass)
      klass.extend ClassMethods
      klass.cattr_accessor :url, :doc, :attribute_names
    end

    module ClassMethods
      # Records the address of the page to scrape and defines the finders
      # +all+, +first+, +last+ and +find_by_index+ on the class.
      def html(url)
        self.url = url
        define_finders
      end

      # Declares a scraped attribute and defines "<name>_values", which
      # returns the text content of every node matched by the selector.
      #
      # name    - attribute name.
      # options - Hash holding the selector under :xpath or :css; :xpath wins
      #           when both are given.
      #
      # Raises ArgumentError when neither selector is supplied, instead of
      # failing later inside the parser with a nil selector.
      def attribute(name, options={})
        parser = options[:xpath] ? :xpath : :css
        selector = options[parser]
        if selector.nil?
          raise ArgumentError, "attribute #{name.inspect} requires an :xpath or :css selector"
        end

        add_attribute(name)
        meta_define "#{name}_values" do
          # The parsed document is cached in the class-level +doc+ accessor.
          self.doc ||= parse_html
          self.doc.send(parser, selector).map(&:content)
        end
      end

      # Declares which attribute identifies a record; defines +find+ and
      # +count+ in terms of that attribute's values.
      def key(attribute)
        define_find_by_id attribute
        define_count attribute
      end

      private

      # Appends +name+ to the class-level list of declared attributes.
      def add_attribute(name)
        self.attribute_names ||= []
        self.attribute_names << name
      end

      # Fetches +url+ and parses it with Nokogiri.
      #
      # Prefers URI.open when open-uri exposes it, because the Kernel#open
      # override for URLs was removed in Ruby 3.0; otherwise falls back to the
      # original Kernel#open call. The block form closes the handle once the
      # document has been built.
      def parse_html
        build = lambda { |io| Nokogiri::HTML(io) }
        if defined?(URI) && URI.respond_to?(:open)
          URI.open(url, &build)
        else
          open(url, &build)
        end
      end

      # Defines the index-based finders. +all+ and +last+ rely on +count+,
      # which only exists once +key+ has been declared.
      def define_finders
        meta_define :all do
          count.times.map do |index|
            find_by_index index
          end
        end

        meta_define :first do
          find_by_index 0
        end

        meta_define :last do
          find_by_index count - 1
        end

        # Returns an OpenStruct with one field per declared attribute, or nil
        # when the index is nil, negative, or past the end of every attribute
        # list (previously an all-nil OpenStruct was returned in that case).
        meta_define :find_by_index do |index|
          next if index.nil? || index < 0

          pairs = attribute_names.map do |attribute|
            [attribute, send("#{attribute}_values")[index]]
          end
          next if pairs.all? { |_, value| value.nil? }

          OpenStruct.new(Hash[pairs])
        end
      end

      # Defines +count+ as the number of values scraped for the key attribute.
      def define_count(key_attribute)
        meta_define :count do
          send("#{key_attribute}_values").size
        end
      end

      # Defines +find+, which looks up a record by the value of its key
      # attribute and returns nil when no value matches.
      def define_find_by_id(key_attribute)
        meta_define :find do |key_value|
          index = send("#{key_attribute}_values").index(key_value)
          find_by_index index
        end
      end
    end
  end
end
|
data/lib/scrapify.rb
ADDED
data/scrapify.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "scrapify/version"
|
4
|
+
|
5
|
+
# Packaging metadata for the scrapify gem. The version is read from
# Scrapify::VERSION, loaded by the require above this block.
Gem::Specification.new do |s|
  s.name        = "scrapify"
  s.version     = Scrapify::VERSION
  s.authors     = ["Sathish & Shakiel"]
  s.email       = ["sathish316@gmail.com"]
  s.homepage    = "http://www.github.com/sathish316/scrapify"
  # Summary previously read "scraESTlike"; it now matches the description.
  s.summary     = %q{ScrApify scraps static html sites to RESTlike APIs}
  s.description = %q{ScrApify scraps static html sites to RESTlike APIs}

  s.rubyforge_project = "scrapify"

  # File lists come from git, so the gem must be built from a checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_development_dependency "rspec"
  s.add_development_dependency "mocha"
  s.add_development_dependency "fakeweb"

  # lib/scrapify/base.rb calls Nokogiri::HTML, so nokogiri is needed at runtime.
  # NOTE(review): base.rb also calls cattr_accessor, presumably from
  # activesupport - confirm whether that needs declaring here too.
  s.add_runtime_dependency "nokogiri"
end
|
data/spec/pizza.rb
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'test_models'
|
3
|
+
|
4
|
+
# Exercises the Scrapify DSL through the Pizza test model, with the remote
# menu page replaced by a canned FakeWeb response.
describe Scrapify do
  let(:pizza_url) { "http://www.dominos.co.in/menuDetails_ajx.php?catgId=1" }
  let(:menu_markup) do
    <<-HTML
      <ul class="menu_lft">
        <li><a>chicken supreme</a><input value="chicken.jpg"></li>
        <li><a>veg supreme</a><input value="veg.jpg"></li>
        <li><a>pepperoni</a><input value="pepperoni.jpg"></li>
      </ul>
    HTML
  end

  # Register the canned page before every example so nothing hits the network.
  before do
    FakeWeb.register_uri :get, pizza_url, :body => menu_markup
  end

  it "should return attribute names" do
    ::Pizza.attribute_names.should == [:name, :image_url]
  end

  describe "html" do
    it "should store url" do
      ::Pizza.url.should == pizza_url
    end

    it "should parse html and fetch attributes using css" do
      expected_names = ['chicken supreme', 'veg supreme', 'pepperoni']
      ::Pizza.name_values.should == expected_names
    end

    it "should parse html and fetch attributes using xpath" do
      expected_images = ['chicken.jpg', 'veg.jpg', 'pepperoni.jpg']
      ::Pizza.image_url_values.should == expected_images
    end
  end

  describe "find" do
    it "should find element by key" do
      found = ::Pizza.find('pepperoni')
      found.should_not be_nil
      found.name.should == 'pepperoni'
      found.image_url.should == 'pepperoni.jpg'
    end

    it "should be nil if element does not exist" do
      ::Pizza.find('mushroom').should be_nil
    end
  end

  describe "first" do
    it "should fetch first matching element" do
      head = ::Pizza.first
      head.name.should == 'chicken supreme'
      head.image_url.should == 'chicken.jpg'
    end
  end

  describe "last" do
    it "should fetch last matching element" do
      tail = ::Pizza.last
      tail.name.should == 'pepperoni'
      tail.image_url.should == 'pepperoni.jpg'
    end
  end

  describe "all" do
    it "should fetch all objects" do
      menu = ::Pizza.all
      menu.size.should == 3
      menu.map { |pizza| pizza.name }.should == ['chicken supreme', 'veg supreme', 'pepperoni']
      menu.map { |pizza| pizza.image_url }.should == ['chicken.jpg', 'veg.jpg', 'pepperoni.jpg']
    end
  end

  describe "count" do
    it "should return number of matching elements" do
      ::Pizza.count.should == 3
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
data/spec/test_models.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'pizza'
|
metadata
ADDED
@@ -0,0 +1,95 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: scrapify
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Sathish & Shakiel
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-05-25 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
requirement: &70282396705740 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70282396705740
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: mocha
|
27
|
+
requirement: &70282396705260 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *70282396705260
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: fakeweb
|
38
|
+
requirement: &70282396704820 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
type: :development
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *70282396704820
|
47
|
+
description: ScrApify scraps static html sites to RESTlike APIs
|
48
|
+
email:
|
49
|
+
- sathish316@gmail.com
|
50
|
+
executables: []
|
51
|
+
extensions: []
|
52
|
+
extra_rdoc_files: []
|
53
|
+
files:
|
54
|
+
- .gitignore
|
55
|
+
- Gemfile
|
56
|
+
- README.md
|
57
|
+
- Rakefile
|
58
|
+
- lib/meta_define.rb
|
59
|
+
- lib/scrapify.rb
|
60
|
+
- lib/scrapify/base.rb
|
61
|
+
- lib/scrapify/version.rb
|
62
|
+
- scrapify.gemspec
|
63
|
+
- spec/pizza.rb
|
64
|
+
- spec/scrapify_spec.rb
|
65
|
+
- spec/spec_helper.rb
|
66
|
+
- spec/test_models.rb
|
67
|
+
homepage: http://www.github.com/sathish316/scrapify
|
68
|
+
licenses: []
|
69
|
+
post_install_message:
|
70
|
+
rdoc_options: []
|
71
|
+
require_paths:
|
72
|
+
- lib
|
73
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
74
|
+
none: false
|
75
|
+
requirements:
|
76
|
+
- - ! '>='
|
77
|
+
- !ruby/object:Gem::Version
|
78
|
+
version: '0'
|
79
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
80
|
+
none: false
|
81
|
+
requirements:
|
82
|
+
- - ! '>='
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
version: '0'
|
85
|
+
requirements: []
|
86
|
+
rubyforge_project: scrapify
|
87
|
+
rubygems_version: 1.8.10
|
88
|
+
signing_key:
|
89
|
+
specification_version: 3
|
90
|
+
summary: ScrApify scraps static html sites to RESTlike APIs
|
91
|
+
test_files:
|
92
|
+
- spec/pizza.rb
|
93
|
+
- spec/scrapify_spec.rb
|
94
|
+
- spec/spec_helper.rb
|
95
|
+
- spec/test_models.rb
|