RubyGems - scrapify - Versions diffs - 0.0.4 → 0.0.5 - Mend

scrapify 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

data/README.md CHANGED Viewed

@@ -8,6 +8,12 @@ ScrApify is a library to build APIs by scraping static sites and use data as mod
 $ gem install scrapify
 ```
+If you're using Bundler, add this to your Gemfile:
+```
+gem 'scrapify'
+```
 ### Usage
 Define html url and declare attributes using xpath or css selectors.
@@ -63,6 +69,34 @@ This will respond to two urls index and show with JSON:
 Jsonify currently has a limitation where the URLs /pizzas.json and /pizzas/1.json cannot be matched by the same map entry in Rack routes
-### JSON API (Rails application example)
+### JSON API (Rails example)
+Scrapify comes with a Rack application called Jsonify which can be used in rails routes to expose scraped models as JSON.
+Check out this [Rails example](https://github.com/sathish316/jsonify_rails_example) for more details:
+https://github.com/sathish316/jsonify_rails_example
+1 Add scrapify to Gemfile
+```
+gem 'scrapify'
+```
+2 Define model to scrap data in app/models
+```
+class Pizza
+  include Scrapify::Base
+end
+```
+3 Add index and show API to routes
+```
+  pizza_api = Jsonify.new('/pizzas', Pizza)
+  get 'pizzas' => pizza_api
+  get 'pizzas/:id' => pizza_api
+```
-TODO
+Jsonify scraps url and exposes index and show urls as JSON APIs

data/lib/jsonify.rb CHANGED Viewed

@@ -7,7 +7,8 @@ class Jsonify
   def call(env)
     path = env['REQUEST_PATH']
     response = path == @route ? all : one(find_id(path))
-    [200, {"Content-Type" => "application/json"}, [response]]
+    header = @model.http_cache_header.merge("Content-Type" => "application/json")
+    [200, header, [response]]
   end
   private
@@ -23,4 +24,4 @@ class Jsonify
   def find_id(path)
     path[path.rindex('/') + 1, path.size]
   end
-end
+end

data/lib/scrapify/base.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 module Scrapify
   module Base
+    HTTP_CACHE_HEADERS_TO_RETURN = %w(Cache-Control Last-Modified Age ETag)
     def self.included(klass)
       klass.extend ClassMethods
       klass.cattr_accessor :url, :doc, :attribute_names
@@ -28,9 +29,19 @@ module Scrapify
         add_attribute(name)
         parser = options[:xpath] ? :xpath : :css
         selector = options[parser]
+        matcher = /#{options[:regex]}/ if options[:regex]
+        to_array = options[:array]
         meta_define "#{name}_values" do
           self.doc ||= parse_html
-          self.doc.send(parser, selector).map &:content
+          self.doc.send(parser, selector).map do |element|
+            content = element.content
+            if matcher
+              match_data = content.scan(matcher).map &:first
+              options[:array] ? match_data : match_data.first
+            else
+              content.strip
+            end
+          end
         end
       end
@@ -39,6 +50,12 @@ module Scrapify
         define_count attribute
       end
+      def http_cache_header
+        http_header.select do |(k, v)|
+          HTTP_CACHE_HEADERS_TO_RETURN.map(&:upcase).include?(k.upcase)
+        end
+      end
       private
       def add_attribute(name)
@@ -47,7 +64,21 @@ module Scrapify
       end
       def parse_html
-        Nokogiri::HTML(open(url))
+        Nokogiri::HTML(html_content)
+      end
+      def html_content
+        http_response.body
+      end
+      def http_response
+        @http_response ||= Net::HTTP.get_response URI(url)
+      end
+      def http_header
+        http_response.header.to_hash.each_with_object({}) do |(k,v), hash|
+          hash[k] = v.first
+        end
       end
       def define_finders

data/lib/scrapify/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Scrapify
-  VERSION = "0.0.4"
+  VERSION = "0.0.5"
 end

data/lib/scrapify.rb CHANGED Viewed

@@ -1,7 +1,8 @@
 require 'scrapify/version'
 require 'active_support/core_ext/class/attribute_accessors'
 require 'nokogiri'
-require 'open-uri'
+require 'uri'
+require 'net/http'
 require 'meta_define'
 require 'scrapify/base'
 require 'json'

data/spec/jsonify_spec.rb CHANGED Viewed

@@ -2,6 +2,11 @@ require 'spec_helper'
 require 'test_models'
 describe Jsonify do
+  before do
+    ::Pizza.stubs(:http_cache_header).returns("cache-control" => "private")
+    ::Pizza.stubs(:all).returns([])
+  end
   it "should find all objects and convert to json for index url" do
     pizzas = [{name: 'cheese'}, {name: 'chicken'}]
     ::Pizza.expects(:all).returns(pizzas)
@@ -25,4 +30,12 @@ describe Jsonify do
     header['Content-Type'].should == 'application/json'
     response.first.should == pizza.to_json
   end
-end
+  it "should forward the http cache headers" do
+    ::Pizza.expects(:http_cache_header).returns("cache-control"=>"private")
+    jsonify = Jsonify.new('/pizzas', ::Pizza)
+    status, header, response = jsonify.call('REQUEST_PATH' => '/pizzas')
+    header['cache-control'].should == "private"
+  end
+end

data/spec/pizza.rb CHANGED Viewed

@@ -4,6 +4,8 @@ class Pizza
   attribute :name, css: ".menu_lft li a"
   attribute :image_url, xpath: "//li//input//@value"
+  attribute :price, css: ".price", regex: /([\d\.]+)/
+  attribute :ingredients, css: ".ingredients", regex: /contains (\w+)/, array: true
   key :name
 end

data/spec/scrapify_spec.rb CHANGED Viewed

@@ -5,17 +5,43 @@ describe Scrapify do
   before do
     @pizza_url = "http://www.dominos.co.in/menuDetails_ajx.php?catgId=1"
-    FakeWeb.register_uri :get, @pizza_url, :body => <<-HTML
+    FakeWeb.register_uri :get, @pizza_url,
+                         :cache_control => "private, s-maxage=0, max-age=0, must-revalidate",
+                         :age           => 51592,
+                         :length        => 12312,
+                         :body          => <<-HTML
       <ul class="menu_lft">
-        <li><a>chicken supreme</a><input value="chicken.jpg"></li>
-        <li><a>veg supreme</a><input value="veg.jpg"></li>
-        <li><a>pepperoni</a><input value="pepperoni.jpg"></li>
+        <li>
+          <a>  chicken supreme  </a><input value="chicken.jpg">
+          <span class='price'>(1.23)</span>
+          <span class='ingredients'>
+            <ol>
+              <li>contains corn</li>
+              <li>contains tomato</li>
+            <ol>
+          </span>
+        </li>
+        <li>
+          <a>veg supreme</a><input value="veg.jpg">
+          <span class='price'>(2.34)</span>
+          <span class='ingredients'>
+            <ol>
+              <li>contains mushroom</li>
+              <li>contains jalapeno</li>
+            <ol>
+          </span>
+        </li>
+        <li>
+          <a>pepperoni</a><input value="pepperoni.jpg">
+          <span class='price'>(3.45)</span>
+          <span class='ingredients'></span>
+        </li>
       </ul>
     HTML
   end
   it "should return attribute names" do
-    ::Pizza.attribute_names.should == [:name, :image_url]
+    ::Pizza.attribute_names.should == [:name, :image_url, :price, :ingredients]
   end
   describe "html" do
@@ -30,6 +56,27 @@ describe Scrapify do
     it "should parse html and fetch attributes using xpath" do
       ::Pizza.image_url_values.should == ['chicken.jpg', 'veg.jpg', 'pepperoni.jpg']
     end
+    it "should parse html and extract attributes using regex" do
+      ::Pizza.price_values.should == ['1.23', '2.34', '3.45']
+    end
+    it "should parse html and extract multiple attributes using regex" do
+      ::Pizza.ingredients_values.should == [['corn','tomato'], ['mushroom','jalapeno'], []]
+    end
+    it "should strip content" do
+      ::Pizza.first.name.should == 'chicken supreme'
+    end
+    describe "cache headers" do
+      it "should return the http headers" do
+        ::Pizza.http_cache_header.should == {
+           "cache-control" => "private, s-maxage=0, max-age=0, must-revalidate",
+           "age"           => 51592,
+        }
+      end
+    end
   end
   describe "find" do
@@ -80,14 +127,33 @@ describe Scrapify do
   describe "attributes" do
     it "should return attributes hash" do
       first_pizza = ::Pizza.first
-      first_pizza.attributes.should == {name: "chicken supreme", image_url: "chicken.jpg"}
+      first_pizza.attributes.should == {
+        name: "chicken supreme",
+        image_url: "chicken.jpg",
+        price: '1.23',
+        ingredients: ['corn', 'tomato']
+      }
     end
   end
   describe "to_json" do
     it "should convert attributes to json" do
       first_pizza = ::Pizza.first
-      first_pizza.to_json.should == {name: "chicken supreme", image_url: "chicken.jpg"}.to_json
+      first_pizza.to_json.should == {
+        name: "chicken supreme",
+        image_url: "chicken.jpg",
+        price: '1.23',
+        ingredients: ['corn', 'tomato']
+      }.to_json
+    end
+    it "should convert array to json" do
+      pizzas = ::Pizza.all
+      pizzas.to_json.should == [
+        {name: "chicken supreme", image_url: "chicken.jpg", price: '1.23', ingredients: ['corn', 'tomato']},
+        {name: "veg supreme", image_url: "veg.jpg", price: '2.34', ingredients: ['mushroom', 'jalapeno']},
+        {name: "pepperoni", image_url: "pepperoni.jpg", price: '3.45', ingredients: []},
+      ].to_json
     end
   end
-end
+end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scrapify
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.5
   prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-05-26 00:00:00.000000000Z
+date: 2012-06-14 00:00:00.000000000Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec
-  requirement: &70261744007960 !ruby/object:Gem::Requirement
+  requirement: &70352195064240 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
         version: '0'
   type: :development
   prerelease: false
-  version_requirements: *70261744007960
+  version_requirements: *70352195064240
 - !ruby/object:Gem::Dependency
   name: mocha
-  requirement: &70261744006240 !ruby/object:Gem::Requirement
+  requirement: &70352195063700 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
         version: '0'
   type: :development
   prerelease: false
-  version_requirements: *70261744006240
+  version_requirements: *70352195063700
 - !ruby/object:Gem::Dependency
   name: fakeweb
-  requirement: &70261743993080 !ruby/object:Gem::Requirement
+  requirement: &70352195062820 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
         version: '0'
   type: :development
   prerelease: false
-  version_requirements: *70261743993080
+  version_requirements: *70352195062820
 - !ruby/object:Gem::Dependency
   name: nokogiri
-  requirement: &70261743990880 !ruby/object:Gem::Requirement
+  requirement: &70352195062120 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70261743990880
+  version_requirements: *70352195062120
 - !ruby/object:Gem::Dependency
   name: activesupport
-  requirement: &70261743989080 !ruby/object:Gem::Requirement
+  requirement: &70352195061500 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70261743989080
+  version_requirements: *70352195061500
 - !ruby/object:Gem::Dependency
   name: json
-  requirement: &70261743987980 !ruby/object:Gem::Requirement
+  requirement: &70352195060200 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -76,7 +76,7 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70261743987980
+  version_requirements: *70352195060200
 description: ScrApify scraps static html sites to RESTlike APIs
 email:
 - sathish316@gmail.com