scrapify 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,10 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in scrapify.gemspec
4
+ gemspec
5
+ gem 'rake'
6
+ gem 'rspec'
7
+ gem 'mocha'
8
+ gem 'fakeweb'
9
+ gem 'active_support'
10
+ gem 'nokogiri'
data/README.md ADDED
@@ -0,0 +1,36 @@
1
+ ## ScrApify
2
+
3
+ ScrApify is a library for building APIs by scraping static sites, exposing an ActiveRecord-like querying interface.
4
+
5
+ ### Installation
6
+
7
+ ```
8
+ $ gem install scrapify
9
+ ```
10
+
11
+ ### Usage
12
+
13
+ Define the HTML URL and declare attributes using XPath or CSS selectors.
14
+ Scrapify classes must have a key attribute defined.
15
+
16
+ ```
17
+ class Pizza
18
+ include Scrapify::Base
19
+ html "http://www.dominos.co.in/menuDetails_ajx.php?catgId=1"
20
+
21
+ attribute :name, css: ".menu_lft li a"
22
+ attribute :image_url, xpath: "//li//input//@value"
23
+
24
+ key :name
25
+ end
26
+ ```
27
+
28
+ Now you can use finder methods to extract data from a static site:
29
+
30
+ ```
31
+ > Pizza.all
32
+
33
+ > pizza = Pizza.find('mushroom')
34
+ > pizza.name
35
+ > pizza.image_url
36
+ ```
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new('spec')
5
+
6
+ task :default => :spec
@@ -0,0 +1,5 @@
1
+ class Object # http://whytheluckystiff.net/articles/seeingMetaclassesClearly.html
2
+ def meta_define name, &blk
3
+ (class << self; self; end).instance_eval { define_method name, &blk }
4
+ end
5
+ end
@@ -0,0 +1,76 @@
1
+ module Scrapify
2
+ module Base
3
+ def self.included(klass)
4
+ klass.extend ClassMethods
5
+ klass.cattr_accessor :url, :doc, :attribute_names
6
+ end
7
+
8
+ module ClassMethods
9
+ def html(url)
10
+ self.url = url
11
+ define_finders
12
+ end
13
+
14
+ def attribute(name, options={})
15
+ add_attribute(name)
16
+ parser = options[:xpath] ? :xpath : :css
17
+ selector = options[parser]
18
+ meta_define "#{name}_values" do
19
+ self.doc ||= parse_html
20
+ self.doc.send(parser, selector).map &:content
21
+ end
22
+ end
23
+
24
+ def key(attribute)
25
+ define_find_by_id attribute
26
+ define_count attribute
27
+ end
28
+
29
+ private
30
+
31
+ def add_attribute(name)
32
+ self.attribute_names ||= []
33
+ self.attribute_names << name
34
+ end
35
+
36
+ def parse_html
37
+ Nokogiri::HTML(open(url))
38
+ end
39
+
40
+ def define_finders
41
+ meta_define :all do
42
+ count.times.map do |index|
43
+ find_by_index index
44
+ end
45
+ end
46
+
47
+ meta_define :first do
48
+ find_by_index 0
49
+ end
50
+
51
+ meta_define :last do
52
+ find_by_index count - 1
53
+ end
54
+
55
+ meta_define :find_by_index do |index|
56
+ return if index.nil? or index < 0
57
+ attributes = Hash[attribute_names.map {|attribute| [attribute, send("#{attribute}_values")[index]]}]
58
+ OpenStruct.new(attributes)
59
+ end
60
+ end
61
+
62
+ def define_count(key_attribute)
63
+ meta_define :count do
64
+ send("#{key_attribute}_values").size
65
+ end
66
+ end
67
+
68
+ def define_find_by_id(key_attribute)
69
+ meta_define :find do |key_value|
70
+ index = send("#{key_attribute}_values").index(key_value)
71
+ find_by_index index
72
+ end
73
+ end
74
+ end
75
+ end
76
+ end
@@ -0,0 +1,3 @@
1
+ module Scrapify
2
+ VERSION = "0.0.1"
3
+ end
data/lib/scrapify.rb ADDED
@@ -0,0 +1,7 @@
1
+ require 'scrapify/version'
2
+ require 'active_support/core_ext/class/attribute_accessors'
3
+ require 'nokogiri'
4
+ require 'open-uri'
5
+ require 'meta_define'
6
+ require 'ostruct'
7
+ require 'scrapify/base'
data/scrapify.gemspec ADDED
@@ -0,0 +1,26 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "scrapify/version"
4
+
5
+ Gem::Specification.new do |s|
6
+ s.name = "scrapify"
7
+ s.version = Scrapify::VERSION
8
+ s.authors = ["Sathish & Shakiel"]
9
+ s.email = ["sathish316@gmail.com"]
10
+ s.homepage = "http://www.github.com/sathish316/scrapify"
11
+ s.summary = %q{ScrApify scraps static html sites to RESTlike APIs}
12
+ s.description = %q{ScrApify scraps static html sites to RESTlike APIs}
13
+
14
+ s.rubyforge_project = "scrapify"
15
+
16
+ s.files = `git ls-files`.split("\n")
17
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
18
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
19
+ s.require_paths = ["lib"]
20
+
21
+ # specify any dependencies here; for example:
22
+ s.add_development_dependency "rspec"
23
+ s.add_development_dependency "mocha"
24
+ s.add_development_dependency "fakeweb"
25
+ # s.add_runtime_dependency "nokogiri"
26
+ end
data/spec/pizza.rb ADDED
@@ -0,0 +1,9 @@
1
+ class Pizza
2
+ include Scrapify::Base
3
+ html "http://www.dominos.co.in/menuDetails_ajx.php?catgId=1"
4
+
5
+ attribute :name, css: ".menu_lft li a"
6
+ attribute :image_url, xpath: "//li//input//@value"
7
+
8
+ key :name
9
+ end
@@ -0,0 +1,79 @@
1
+ require 'spec_helper'
2
+ require 'test_models'
3
+
4
+ describe Scrapify do
5
+
6
+ before do
7
+ @pizza_url = "http://www.dominos.co.in/menuDetails_ajx.php?catgId=1"
8
+ FakeWeb.register_uri :get, @pizza_url, :body => <<-HTML
9
+ <ul class="menu_lft">
10
+ <li><a>chicken supreme</a><input value="chicken.jpg"></li>
11
+ <li><a>veg supreme</a><input value="veg.jpg"></li>
12
+ <li><a>pepperoni</a><input value="pepperoni.jpg"></li>
13
+ </ul>
14
+ HTML
15
+ end
16
+
17
+ it "should return attribute names" do
18
+ ::Pizza.attribute_names.should == [:name, :image_url]
19
+ end
20
+
21
+ describe "html" do
22
+ it "should store url" do
23
+ ::Pizza.url.should == @pizza_url
24
+ end
25
+
26
+ it "should parse html and fetch attributes using css" do
27
+ ::Pizza.name_values.should == ['chicken supreme', 'veg supreme', 'pepperoni']
28
+ end
29
+
30
+ it "should parse html and fetch attributes using xpath" do
31
+ ::Pizza.image_url_values.should == ['chicken.jpg', 'veg.jpg', 'pepperoni.jpg']
32
+ end
33
+ end
34
+
35
+ describe "find" do
36
+ it "should find element by key" do
37
+ pizza = ::Pizza.find('pepperoni')
38
+ pizza.should_not be_nil
39
+ pizza.name.should == 'pepperoni'
40
+ pizza.image_url.should == 'pepperoni.jpg'
41
+ end
42
+
43
+ it "should be nil if element does not exist" do
44
+ pizza = ::Pizza.find('mushroom')
45
+ pizza.should be_nil
46
+ end
47
+ end
48
+
49
+ describe "first" do
50
+ it "should fetch first matching element" do
51
+ first_pizza = ::Pizza.first
52
+ first_pizza.name.should == 'chicken supreme'
53
+ first_pizza.image_url.should == 'chicken.jpg'
54
+ end
55
+ end
56
+
57
+ describe "last" do
58
+ it "should fetch last matching element" do
59
+ last_pizza = ::Pizza.last
60
+ last_pizza.name.should == 'pepperoni'
61
+ last_pizza.image_url.should == 'pepperoni.jpg'
62
+ end
63
+ end
64
+
65
+ describe "all" do
66
+ it "should fetch all objects" do
67
+ pizzas = ::Pizza.all
68
+ pizzas.size.should == 3
69
+ pizzas.map(&:name).should == ['chicken supreme', 'veg supreme', 'pepperoni']
70
+ pizzas.map(&:image_url).should == ['chicken.jpg', 'veg.jpg', 'pepperoni.jpg']
71
+ end
72
+ end
73
+
74
+ describe "count" do
75
+ it "should return number of matching elements" do
76
+ ::Pizza.count.should == 3
77
+ end
78
+ end
79
+ end
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+ require 'rspec/mocks'
4
+ require 'fakeweb'
5
+
6
+ require 'scrapify'
7
+
8
+ RSpec.configure do |config|
9
+ config.mock_with :mocha
10
+ end
@@ -0,0 +1 @@
1
+ require 'pizza'
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scrapify
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Sathish & Shakiel
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-05-25 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec
16
+ requirement: &70282396705740 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *70282396705740
25
+ - !ruby/object:Gem::Dependency
26
+ name: mocha
27
+ requirement: &70282396705260 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *70282396705260
36
+ - !ruby/object:Gem::Dependency
37
+ name: fakeweb
38
+ requirement: &70282396704820 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :development
45
+ prerelease: false
46
+ version_requirements: *70282396704820
47
+ description: ScrApify scraps static html sites to RESTlike APIs
48
+ email:
49
+ - sathish316@gmail.com
50
+ executables: []
51
+ extensions: []
52
+ extra_rdoc_files: []
53
+ files:
54
+ - .gitignore
55
+ - Gemfile
56
+ - README.md
57
+ - Rakefile
58
+ - lib/meta_define.rb
59
+ - lib/scrapify.rb
60
+ - lib/scrapify/base.rb
61
+ - lib/scrapify/version.rb
62
+ - scrapify.gemspec
63
+ - spec/pizza.rb
64
+ - spec/scrapify_spec.rb
65
+ - spec/spec_helper.rb
66
+ - spec/test_models.rb
67
+ homepage: http://www.github.com/sathish316/scrapify
68
+ licenses: []
69
+ post_install_message:
70
+ rdoc_options: []
71
+ require_paths:
72
+ - lib
73
+ required_ruby_version: !ruby/object:Gem::Requirement
74
+ none: false
75
+ requirements:
76
+ - - ! '>='
77
+ - !ruby/object:Gem::Version
78
+ version: '0'
79
+ required_rubygems_version: !ruby/object:Gem::Requirement
80
+ none: false
81
+ requirements:
82
+ - - ! '>='
83
+ - !ruby/object:Gem::Version
84
+ version: '0'
85
+ requirements: []
86
+ rubyforge_project: scrapify
87
+ rubygems_version: 1.8.10
88
+ signing_key:
89
+ specification_version: 3
90
+ summary: ScrApify scraps static html sites to RESTlike APIs
91
+ test_files:
92
+ - spec/pizza.rb
93
+ - spec/scrapify_spec.rb
94
+ - spec/spec_helper.rb
95
+ - spec/test_models.rb