scrapify 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,10 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in scrapify.gemspec
4
+ gemspec
5
+ gem 'rake'
6
+ gem 'rspec'
7
+ gem 'mocha'
8
+ gem 'fakeweb'
9
+ gem 'active_support' # NOTE(review): lib/scrapify.rb requires active_support/core_ext/...; confirm this gem name provides it
10
+ gem 'nokogiri'
data/README.md ADDED
@@ -0,0 +1,36 @@
1
+ ## ScrApify
2
+
3
+ ScrApify is a library for building APIs by scraping static sites, with an ActiveRecord-like querying interface.
4
+
5
+ ### Installation
6
+
7
+ ```
8
+ $ gem install scrapify
9
+ ```
10
+
11
+ ### Usage
12
+
13
+ Define the HTML URL and declare attributes using XPath or CSS selectors.
14
+ Scrapify classes must have a key attribute defined.
15
+
16
+ ```
17
+ class Pizza
18
+ include Scrapify::Base
19
+ html "http://www.dominos.co.in/menuDetails_ajx.php?catgId=1"
20
+
21
+ attribute :name, css: ".menu_lft li a"
22
+ attribute :image_url, xpath: "//li//input//@value"
23
+
24
+ key :name
25
+ end
26
+ ```
27
+
28
+ Now you can use finder methods to extract data from a static site
29
+
30
+ ```
31
+ > Pizza.all
32
+
33
+ > pizza = Pizza.find('mushroom')
34
+ > pizza.name
35
+ > pizza.image_url
36
+ ```
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new('spec') # registers a rake task named 'spec'
5
+
6
+ task :default => :spec # the default task depends on spec, so a bare `rake` runs it
@@ -0,0 +1,5 @@
1
+ class Object # Core extension adding meta_define to every object; technique from http://whytheluckystiff.net/articles/seeingMetaclassesClearly.html
2
+ def meta_define name, &blk # Defines a method called `name`, with body `blk`, on the receiver's singleton class
3
+ (class << self; self; end).instance_eval { define_method name, &blk } # instance_eval makes the singleton class the receiver of define_method
4
+ end
5
+ end
@@ -0,0 +1,76 @@
1
+ module Scrapify
2
+ module Base
3
+ def self.included(klass)
4
+ klass.extend ClassMethods
5
+ klass.cattr_accessor :url, :doc, :attribute_names
6
+ end
7
+
8
+ module ClassMethods
9
+ def html(url)
10
+ self.url = url
11
+ define_finders
12
+ end
13
+
14
+ def attribute(name, options={})
15
+ add_attribute(name)
16
+ parser = options[:xpath] ? :xpath : :css
17
+ selector = options[parser]
18
+ meta_define "#{name}_values" do
19
+ self.doc ||= parse_html
20
+ self.doc.send(parser, selector).map &:content
21
+ end
22
+ end
23
+
24
+ def key(attribute)
25
+ define_find_by_id attribute
26
+ define_count attribute
27
+ end
28
+
29
+ private
30
+
31
+ def add_attribute(name)
32
+ self.attribute_names ||= []
33
+ self.attribute_names << name
34
+ end
35
+
36
+ def parse_html
37
+ Nokogiri::HTML(open(url))
38
+ end
39
+
40
+ def define_finders
41
+ meta_define :all do
42
+ count.times.map do |index|
43
+ find_by_index index
44
+ end
45
+ end
46
+
47
+ meta_define :first do
48
+ find_by_index 0
49
+ end
50
+
51
+ meta_define :last do
52
+ find_by_index count - 1
53
+ end
54
+
55
+ meta_define :find_by_index do |index|
56
+ return if index.nil? or index < 0
57
+ attributes = Hash[attribute_names.map {|attribute| [attribute, send("#{attribute}_values")[index]]}]
58
+ OpenStruct.new(attributes)
59
+ end
60
+ end
61
+
62
+ def define_count(key_attribute)
63
+ meta_define :count do
64
+ send("#{key_attribute}_values").size
65
+ end
66
+ end
67
+
68
+ def define_find_by_id(key_attribute)
69
+ meta_define :find do |key_value|
70
+ index = send("#{key_attribute}_values").index(key_value)
71
+ find_by_index index
72
+ end
73
+ end
74
+ end
75
+ end
76
+ end
@@ -0,0 +1,3 @@
1
+ module Scrapify
2
+ VERSION = "0.0.1" # gem version string; scrapify.gemspec reads it as Scrapify::VERSION
3
+ end
data/lib/scrapify.rb ADDED
@@ -0,0 +1,7 @@
1
+ require 'scrapify/version'
2
+ require 'active_support/core_ext/class/attribute_accessors'
3
+ require 'nokogiri'
4
+ require 'open-uri'
5
+ require 'meta_define'
6
+ require 'ostruct'
7
+ require 'scrapify/base'
data/scrapify.gemspec ADDED
@@ -0,0 +1,26 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "scrapify/version"
4
+
5
+ Gem::Specification.new do |s|
6
+ s.name = "scrapify"
7
+ s.version = Scrapify::VERSION
8
+ s.authors = ["Sathish & Shakiel"]
9
+ s.email = ["sathish316@gmail.com"]
10
+ s.homepage = "http://www.github.com/sathish316/scrapify"
11
+ s.summary = %q{ScrApify scraps static html sites to scraESTlike APIs}
12
+ s.description = %q{ScrApify scraps static html sites to RESTlike APIs}
13
+
14
+ s.rubyforge_project = "scrapify"
15
+
16
+ s.files = `git ls-files`.split("\n")
17
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
18
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
19
+ s.require_paths = ["lib"]
20
+
21
+ # specify any dependencies here; for example:
22
+ s.add_development_dependency "rspec"
23
+ s.add_development_dependency "mocha"
24
+ s.add_development_dependency "fakeweb"
25
+ # s.add_runtime_dependency "nokogiri"
26
+ end
data/spec/pizza.rb ADDED
@@ -0,0 +1,9 @@
1
+ class Pizza # Test model for the specs: pizza names and image urls declared with the Scrapify::Base DSL
2
+ include Scrapify::Base
3
+ html "http://www.dominos.co.in/menuDetails_ajx.php?catgId=1"
4
+
5
+ attribute :name, css: ".menu_lft li a" # CSS selector variant
6
+ attribute :image_url, xpath: "//li//input//@value" # XPath selector variant
7
+
8
+ key :name # name is the lookup key used by Pizza.find and Pizza.count
9
+ end
@@ -0,0 +1,79 @@
1
+ require 'spec_helper'
2
+ require 'test_models'
3
+
4
+ describe Scrapify do # Exercises the Scrapify::Base DSL and finders through the Pizza test model
5
+
6
+ before do # registers a canned three-item menu page for the Pizza url with FakeWeb
7
+ @pizza_url = "http://www.dominos.co.in/menuDetails_ajx.php?catgId=1"
8
+ FakeWeb.register_uri :get, @pizza_url, :body => <<-HTML
9
+ <ul class="menu_lft">
10
+ <li><a>chicken supreme</a><input value="chicken.jpg"></li>
11
+ <li><a>veg supreme</a><input value="veg.jpg"></li>
12
+ <li><a>pepperoni</a><input value="pepperoni.jpg"></li>
13
+ </ul>
14
+ HTML
15
+ end
16
+
17
+ it "should return attribute names" do
18
+ ::Pizza.attribute_names.should == [:name, :image_url]
19
+ end
20
+
21
+ describe "html" do
22
+ it "should store url" do
23
+ ::Pizza.url.should == @pizza_url
24
+ end
25
+
26
+ it "should parse html and fetch attributes using css" do
27
+ ::Pizza.name_values.should == ['chicken supreme', 'veg supreme', 'pepperoni']
28
+ end
29
+
30
+ it "should parse html and fetch attributes using xpath" do
31
+ ::Pizza.image_url_values.should == ['chicken.jpg', 'veg.jpg', 'pepperoni.jpg']
32
+ end
33
+ end
34
+
35
+ describe "find" do
36
+ it "should find element by key" do
37
+ pizza = ::Pizza.find('pepperoni')
38
+ pizza.should_not be_nil
39
+ pizza.name.should == 'pepperoni'
40
+ pizza.image_url.should == 'pepperoni.jpg'
41
+ end
42
+
43
+ it "should be nil if element does not exist" do
44
+ pizza = ::Pizza.find('mushroom') # 'mushroom' is not in the canned page
45
+ pizza.should be_nil
46
+ end
47
+ end
48
+
49
+ describe "first" do
50
+ it "should fetch first matching element" do
51
+ first_pizza = ::Pizza.first
52
+ first_pizza.name.should == 'chicken supreme'
53
+ first_pizza.image_url.should == 'chicken.jpg'
54
+ end
55
+ end
56
+
57
+ describe "last" do
58
+ it "should fetch last matching element" do
59
+ last_pizza = ::Pizza.last
60
+ last_pizza.name.should == 'pepperoni'
61
+ last_pizza.image_url.should == 'pepperoni.jpg'
62
+ end
63
+ end
64
+
65
+ describe "all" do
66
+ it "should fetch all objects" do
67
+ pizzas = ::Pizza.all
68
+ pizzas.size.should == 3
69
+ pizzas.map(&:name).should == ['chicken supreme', 'veg supreme', 'pepperoni']
70
+ pizzas.map(&:image_url).should == ['chicken.jpg', 'veg.jpg', 'pepperoni.jpg']
71
+ end
72
+ end
73
+
74
+ describe "count" do
75
+ it "should return number of matching elements" do
76
+ ::Pizza.count.should == 3
77
+ end
78
+ end
79
+ end
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+ require 'rspec/mocks'
4
+ require 'fakeweb'
5
+
6
+ require 'scrapify'
7
+
8
+ RSpec.configure do |config|
9
+ config.mock_with :mocha # selects mocha as RSpec's mocking framework
10
+ end
@@ -0,0 +1 @@
1
+ require 'pizza'
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scrapify
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Sathish & Shakiel
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-05-25 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec
16
+ requirement: &70282396705740 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *70282396705740
25
+ - !ruby/object:Gem::Dependency
26
+ name: mocha
27
+ requirement: &70282396705260 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *70282396705260
36
+ - !ruby/object:Gem::Dependency
37
+ name: fakeweb
38
+ requirement: &70282396704820 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :development
45
+ prerelease: false
46
+ version_requirements: *70282396704820
47
+ description: ScrApify scraps static html sites to RESTlike APIs
48
+ email:
49
+ - sathish316@gmail.com
50
+ executables: []
51
+ extensions: []
52
+ extra_rdoc_files: []
53
+ files:
54
+ - .gitignore
55
+ - Gemfile
56
+ - README.md
57
+ - Rakefile
58
+ - lib/meta_define.rb
59
+ - lib/scrapify.rb
60
+ - lib/scrapify/base.rb
61
+ - lib/scrapify/version.rb
62
+ - scrapify.gemspec
63
+ - spec/pizza.rb
64
+ - spec/scrapify_spec.rb
65
+ - spec/spec_helper.rb
66
+ - spec/test_models.rb
67
+ homepage: http://www.github.com/sathish316/scrapify
68
+ licenses: []
69
+ post_install_message:
70
+ rdoc_options: []
71
+ require_paths:
72
+ - lib
73
+ required_ruby_version: !ruby/object:Gem::Requirement
74
+ none: false
75
+ requirements:
76
+ - - ! '>='
77
+ - !ruby/object:Gem::Version
78
+ version: '0'
79
+ required_rubygems_version: !ruby/object:Gem::Requirement
80
+ none: false
81
+ requirements:
82
+ - - ! '>='
83
+ - !ruby/object:Gem::Version
84
+ version: '0'
85
+ requirements: []
86
+ rubyforge_project: scrapify
87
+ rubygems_version: 1.8.10
88
+ signing_key:
89
+ specification_version: 3
90
+ summary: ScrApify scraps static html sites to RESTlike APIs
91
+ test_files:
92
+ - spec/pizza.rb
93
+ - spec/scrapify_spec.rb
94
+ - spec/spec_helper.rb
95
+ - spec/test_models.rb