content_crawler 0.0.1

checksums.yaml.gz ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: e07d1889de277c5d40e886101e3de4f0a36aa618
+   data.tar.gz: cc61a64b1634784495b7ad8c28a7510e63e17589
+ SHA512:
+   metadata.gz: fb737889a42dba37e550d232ddd074c295afa3ea8c0b6d95015d388053c5b73ee8791f3ee9646b76ef45300dac8d765888568b385a964546a2890f184c0310c0
+   data.tar.gz: 32fc8c939a11f70ae3ee1ed502a919c99dbf121485aa0fa9704598ea13b18223caed21d86603adde5d4bcfce68ca272634e4d78438aa4de453b037bc3c59b13d
data/.rspec ADDED
@@ -0,0 +1,2 @@
+ --color
+ --require spec_helper
data/Gemfile ADDED
@@ -0,0 +1,13 @@
+ source 'https://rubygems.org'
+ 
+ # Specify your gem's dependencies in content_crawler.gemspec
+ 
+ # get browsers
+ gem 'browser'
+ gem 'watir-webdriver'
+ gem 'headless'
+ gem 'nokogiri'
+ gem 'mechanize'
+ gem 'rspec'
+ 
+ # gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,66 @@
+ GEM
+   remote: https://rubygems.org/
+   specs:
+     browser (0.6.0)
+     childprocess (0.5.3)
+       ffi (~> 1.0, >= 1.0.11)
+     diff-lcs (1.2.5)
+     domain_name (0.5.21)
+       unf (>= 0.0.5, < 1.0.0)
+     ffi (1.9.3)
+     headless (1.0.2)
+     http-cookie (1.0.2)
+       domain_name (~> 0.5)
+     mechanize (2.7.3)
+       domain_name (~> 0.5, >= 0.5.1)
+       http-cookie (~> 1.0)
+       mime-types (~> 2.0)
+       net-http-digest_auth (~> 1.1, >= 1.1.1)
+       net-http-persistent (~> 2.5, >= 2.5.2)
+       nokogiri (~> 1.4)
+       ntlm-http (~> 0.1, >= 0.1.1)
+       webrobots (>= 0.0.9, < 0.2)
+     mime-types (2.3)
+     mini_portile (0.6.0)
+     multi_json (1.10.1)
+     net-http-digest_auth (1.4)
+     net-http-persistent (2.9.4)
+     nokogiri (1.6.3.1)
+       mini_portile (= 0.6.0)
+     ntlm-http (0.1.1)
+     rspec (3.1.0)
+       rspec-core (~> 3.1.0)
+       rspec-expectations (~> 3.1.0)
+       rspec-mocks (~> 3.1.0)
+     rspec-core (3.1.2)
+       rspec-support (~> 3.1.0)
+     rspec-expectations (3.1.0)
+       diff-lcs (>= 1.2.0, < 2.0)
+       rspec-support (~> 3.1.0)
+     rspec-mocks (3.1.0)
+       rspec-support (~> 3.1.0)
+     rspec-support (3.1.0)
+     rubyzip (1.1.6)
+     selenium-webdriver (2.43.0)
+       childprocess (~> 0.5)
+       multi_json (~> 1.0)
+       rubyzip (~> 1.0)
+       websocket (~> 1.0)
+     unf (0.1.4)
+       unf_ext
+     unf_ext (0.0.6)
+     watir-webdriver (0.6.10)
+       selenium-webdriver (>= 2.18.0)
+     webrobots (0.1.1)
+     websocket (1.2.1)
+ 
+ PLATFORMS
+   ruby
+ 
+ DEPENDENCIES
+   browser
+   headless
+   mechanize
+   nokogiri
+   rspec
+   watir-webdriver
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2014 Prakash Natarajan
+ 
+ MIT License
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,31 @@
+ # ContentCrawler
+ 
+ TODO: Write a gem description
+ 
+ ## Installation
+ 
+ Add this line to your application's Gemfile:
+ 
+ ```ruby
+ gem 'content_crawler'
+ ```
+ 
+ And then execute:
+ 
+     $ bundle
+ 
+ Or install it yourself as:
+ 
+     $ gem install content_crawler
+ 
+ ## Usage
+ 
+ TODO: Write usage instructions here
+ 
+ ## Contributing
+ 
+ 1. Fork it ( https://github.com/[my-github-username]/content_crawler/fork )
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
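
The README's Usage section is still a TODO. Going by the spec suite further down, a minimal sketch of intended usage (assuming the "mechanize_parser" backend; the URL is a placeholder) looks like this:

```ruby
require 'content_crawler'

# Minimal sketch, mirroring the calls exercised in spec/content_crawler_spec.rb.
crawler = ContentCrawler::Crawler.new("mechanize_parser",
                                      "http://example.com",
                                      :user_agent => "Mac Safari")
crawler.get_parser_page("http://example.com/some_page")

# Extract anchor hrefs via XPath.
puts crawler.get_link_elements("//a", :format => "only_hrefs")
```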
data/Rakefile ADDED
@@ -0,0 +1,2 @@
+ require "bundler/gem_tasks"
+ 
data/content_crawler.gemspec ADDED
@@ -0,0 +1,24 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'content_crawler/version'
+ 
+ Gem::Specification.new do |spec|
+   spec.name          = "content_crawler"
+   spec.version       = ContentCrawler::VERSION
+   spec.authors       = ["Prakash Natarajan"]
+   spec.email         = ["prakashntrjn@gmail.com"]
+   spec.summary       = %q{Content crawler}
+   spec.description   = %q{Crawls content from websites using user-supplied XPath expressions. New functionality will be added in future releases.}
+   spec.homepage      = ""
+   spec.license       = "MIT"
+ 
+   spec.files         = `git ls-files -z`.split("\x0")
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+ 
+   spec.add_development_dependency "bundler", "~> 1.7"
+   spec.add_development_dependency "rake", "~> 10.0"
+   spec.add_development_dependency "rspec"
+ end
data/lib/content_crawler.rb ADDED
@@ -0,0 +1,65 @@
+ require "content_crawler/version"
+ require "content_crawler/crawler_process"
+ 
+ =begin
+ module ContentCrawler
+   include CrawlerProcess
+   # Your code goes here...
+ end
+ =end
+ 
+ module ContentCrawler
+   class Crawler
+     include CrawlerProcess
+ 
+     def initialize(crawler, base_url, options = {:timeout => 300, :user_agent => nil})
+       super
+     end
+ 
+     def get_parser_page(crawl_url = nil)
+       if (not @browser.nil? and not crawl_url.nil?)
+         @browser.goto(crawl_url)
+         @page = Nokogiri::HTML(@browser.html)
+       elsif (not @agent.nil? and not crawl_url.nil?)
+         @page = @agent.get(crawl_url).parser
+       else
+         "Please select any one of the parser(selenium_webdriver_with_headless, selenium_webdriver_without_headless, mechanize_parser) and pass the crawl_url to crawl content"
+       end
+     end
+ 
+     def get_simple_text(xpath = nil)
+       @page.xpath(xpath).text.strip if not xpath.nil?
+     end
+ 
+     def get_link_elements(xpath = nil, options = {})
+       collection_links(@page.xpath(xpath), options) if not xpath.nil?
+     end
+ 
+     def get_remote_image(xpath = nil, image_store_dir = nil)
+       store_remote_image(@page.xpath(xpath), image_store_dir) if not xpath.nil?
+     end
+ 
+     def get_select_elements(xpath = nil, options = {})
+       select_collection(@page.xpath(xpath), options) if not xpath.nil?
+     end
+ 
+     def get_iframe_embed_elements(xpath = nil, options = {})
+       iframe_embed_collection(@page.xpath(xpath), options) if not xpath.nil?
+     end
+ 
+     def get_audio_video_elements(xpath = nil, options = {})
+       audio_video_collection(@page.xpath(xpath), options) if not xpath.nil?
+     end
+ 
+     def get_object_elements(xpath = nil, options = {})
+       object_collection(@page.xpath(xpath), options) if not xpath.nil?
+     end
+ 
+     def get_datalist_elements(xpath = nil, options = {})
+       datalist_collection(@page.xpath(xpath), options) if not xpath.nil?
+     end
+ 
+     def close_browser
+       super
+     end
+   end
+ end
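
`Crawler` is a thin layer of XPath-oriented getters over `CrawlerProcess`; the backend is chosen entirely by the first constructor argument. A sketch of the browser-backed path (assuming Firefox and Xvfb are installed locally; the URL is a placeholder):

```ruby
require 'content_crawler'

# "selenium_webdriver_with_headless" starts an Xvfb display and a Firefox
# instance, so this sketch only runs where both are available.
crawler = ContentCrawler::Crawler.new("selenium_webdriver_with_headless",
                                      "http://example.com",
                                      :timeout => 300)
begin
  crawler.get_parser_page("http://example.com/videos")
  puts crawler.get_audio_video_elements("//video/source", :format => "only_srcs")
ensure
  # Shuts down the browser and destroys the headless display.
  crawler.close_browser
end
```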
data/lib/content_crawler/crawler_process.rb ADDED
@@ -0,0 +1,169 @@
+ require 'fileutils'
+ require 'net/https'
+ require 'uri'
+ require 'nokogiri'
+ require 'headless'
+ require 'watir-webdriver'
+ require 'mechanize'
+ 
+ module CrawlerProcess
+   # Initialize the crawler process
+   def initialize(crawler, base_url, options = {:timeout => 300, :user_agent => nil})
+     @base_url = base_url
+     case crawler
+     when "selenium_webdriver_with_headless"
+       @headless = Headless.new
+       @headless.start
+       watir_web_browser(options[:timeout])
+     when "selenium_webdriver_without_headless"
+       watir_web_browser(options[:timeout])
+     when "mechanize_parser"
+       mechanize_parser(options[:user_agent])
+     else
+       puts "Please select any one of the parser(selenium_webdriver_with_headless, selenium_webdriver_without_headless, mechanize_parser) to crawl content"
+     end
+   end
+ 
+   # Watir web driver, which opens a browser
+   def watir_web_browser(timeout)
+     client = Selenium::WebDriver::Remote::Http::Default.new
+     client.timeout = timeout
+     @browser = Watir::Browser.new :firefox, :http_client => client
+     @browser.goto(@base_url)
+     @browser
+   end
+ 
+   # Mechanize parser
+   def mechanize_parser(user_agent = nil)
+     if user_agent.nil?
+       @agent = Mechanize.new { |a| a.ssl_version, a.verify_mode = 'SSLv3', OpenSSL::SSL::VERIFY_NONE }
+     else
+       @agent = Mechanize.new { |agent| agent.user_agent_alias = user_agent }
+     end
+     #@page = @agent.get(@base_url).parser
+     @agent
+   end
+ 
+   # To get the anchor tag details
+   def collection_links(parser_links, options = {})
+     links = Array.new
+     parser_links = [parser_links].flatten.uniq
+     parser_links.each do |link|
+       data = {}
+       data[:href] = link.attributes["href"].nil? ? " " : link.attributes["href"].value.strip
+       data[:text] = link.text.nil? ? " " : link.text.strip
+       links << data
+     end
+     collection_attr(links, options)
+   end
+ 
+   # To download remote images into a local directory
+   def store_remote_image(image_detail, image_store_dir)
+     image_store_dir = check_local_dir(image_store_dir)
+     remote_image_urls = iframe_embed_collection(image_detail, {:format => "only_srcs"})
+     local_images = []
+     remote_image_urls.each do |image_url|
+       image_url = "#{@base_url}#{image_url}" if not image_url.include?("http")
+       url = URI.parse(image_url)
+       response = Net::HTTP.get_response(url)
+       if response.is_a?(Net::HTTPSuccess)
+         http = Net::HTTP.new(url.host, url.port)
+         http.use_ssl = true if url.scheme == "https"
+         http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+         http.start do
+           http.request_get(url.path) do |res|
+             File.open("#{image_store_dir}/#{File.basename(url.path)}", 'wb') do |file|
+               file.write(res.body)
+             end
+           end
+         end
+         local_image = "#{image_store_dir}/#{File.basename(url.path)}"
+         local_images << local_image
+       end
+     end
+     local_images
+   end
+ 
+   # Ensure the local image directory exists
+   def check_local_dir(image_store_dir)
+     image_store_dir = "#{Dir.home}/crawled_images" if image_store_dir.nil?
+     if not Dir.exist?("#{image_store_dir}")
+       Dir.mkdir("#{image_store_dir}")
+     end
+     image_store_dir
+   end
+ 
+   # To get select tag details
+   def select_collection(select_detail, options = {})
+     selects = []
+     select_detail.each do |select|
+       hash = {}
+       hash[:text] = select.text.strip
+       hash[:value] = select.attributes["value"].text.strip
+       selects << hash
+     end
+     collection_attr(selects, options)
+   end
+ 
+   # To get iframe and embed srcs
+   def iframe_embed_collection(ifrm_embd_detail, options = {})
+     ifrm_embds = []
+     ifrm_embd_detail.each do |ifrmembd|
+       hash = {}
+       hash[:src] = ifrmembd.value.strip
+       ifrm_embds << hash
+     end
+     collection_attr(ifrm_embds, options)
+   end
+ 
+   # To get audio and video details
+   def audio_video_collection(audio_video_detail, options = {})
+     auvid_collection = []
+     audio_video_detail.each do |auvid|
+       hash = {}
+       hash[:src] = auvid.attributes["src"].value.strip
+       hash[:type] = auvid.attributes["type"].value.strip
+       auvid_collection << hash
+     end
+     collection_attr(auvid_collection, options)
+   end
+ 
+   # To get object details
+   def object_collection(object_detail, options = {})
+     objects = []
+     object_detail.each do |object|
+       hash = {}
+       hash[:text] = object.text.strip
+       hash[:value] = object.value.strip
+       objects << hash
+     end
+     collection_attr(objects, options)
+   end
+ 
+   # To get datalist values
+   def datalist_collection(datalist_detail, options = {})
+     datalists = []
+     datalist_detail.each do |datalist|
+       hash = {}
+       hash[:value] = datalist.attributes["value"].value.strip
+       datalists << hash
+     end
+     collection_attr(datalists, options)
+   end
+ 
+   # To pick a particular attribute out of the collected hashes
+   def collection_attr(collection, options)
+     collection = [collection].flatten.compact.uniq
+     case options[:format]
+     when "srcs_types", "texts_values", "texts_srcs", "texts_hrefs"
+       collection
+     when "only_srcs"
+       collection.map { |collobjt| collobjt[:src] }.compact
+     when "only_types"
+       collection.map { |collobjt| collobjt[:type] }.compact
+     when "only_values"
+       collection.map { |collobjt| collobjt[:value] }.compact
+     when "only_texts"
+       collection.map { |collobjt| collobjt[:text] }.compact
+     when "only_hrefs"
+       collection.map { |collobjt| collobjt[:href] }.compact
+     else
+       collection
+     end
+   end
+ 
+   # Close the browser and tear down the headless display
+   def close_browser
+     @browser.close if not @browser.nil?
+     @headless.destroy if not @headless.nil?
+   end
+ end
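
Every getter funnels through `collection_attr`, so the `:format` option decides whether callers get back an array of hashes or a flat array of a single attribute. Reusing the crawler sketched above, with return values taken from the spec suite run against the fixture page below:

```ruby
# Default (no :format): hashes with every collected attribute.
crawler.get_link_elements("//a")
# => [{:href=>"http://www.test.test", :text=>"Opmerkingen"},
#     {:href=>" ", :text=>"Geschiedenis bijwerken"}]

# "only_*" formats flatten to one attribute per element.
crawler.get_link_elements("//a", :format => "only_hrefs")
# => ["http://www.test.test", " "]
```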
data/lib/content_crawler/version.rb ADDED
@@ -0,0 +1,3 @@
+ module ContentCrawler
+   VERSION = "0.0.1"
+ end
data/public/html_test.html ADDED
@@ -0,0 +1,140 @@
+ <html>
+ <body>
+   <div class="vtabs-content" id="tab-history" style="display: block;">
+     <div id="history">
+       <table class="form">
+         <tbody>
+           <tr>
+             <td>Bestelstatus:</td>
+ 
+             <td><select name="order_status_id">
+               <option value="23">Bestelling geannuleerd</option>
+ 
+               <option value="17">Bestelling ontvangen</option>
+ 
+               <option selected="selected" value="24">Bestelling verzonden</option>
+ 
+               <option value="22">Betaling mislukt</option>
+ 
+               <option value="20">Betaling ontvangen via Bank</option>
+ 
+               <option value="19">Betaling ontvangen via PayPal</option>
+ 
+               <option value="21">Betaling via Bank mislukt</option>
+ 
+               <option value="18">Betaling via PayPal mislukt</option>
+ 
+               <option value="25">Gereed voor afhalen (Delft)</option>
+ 
+               <option value="26">Wachten op betaling</option>
+             </select></td>
+           </tr>
+ 
+           <tr>
+             <td>Mail naar klant:<span class="help">Wel zichtbaar voor klant bij
+             de bestelinfo.</span></td>
+ 
+             <td><input name="notify" type="checkbox" value="1"></td>
+           </tr>
+ 
+           <tr>
+             <td><a href="http://www.test.test">Opmerkingen</a></td>
+ 
+             <td>
+               <textarea cols="40" name="comment" rows="8" style="width: 99%"></textarea>
+ 
+               <div style="margin-top: 10px; text-align: right;">
+                 <a class="button" id="button-history" name=
+                 "button-history">Geschiedenis bijwerken</a>
+               </div>
+             </td>
+           </tr>
+ 
+           <tr>
+             <td><img src="https://farm4.staticflickr.com/3147/2462582861_31d51f157c_b.jpg"></td>
+           </tr>
+ 
+         </tbody>
+       </table>
+     </div>
+   </div>
+ 
+ 
+   <frameset rows="10%,80%,10%">
+     <frame name="top" src="http://www.tutorialspoint.com/html/top_frame.htm" />
+     <frame name="main" src="http://www.tutorialspoint.com/html/main_frame.htm" />
+     <frame name="bottom" src="http://www.tutorialspoint.com/html/bottom_frame.htm" />
+     <noframes>
+       <body>
+         Your browser does not support frames.
+       </body>
+     </noframes>
+   </frameset>
+ 
+   <embed src="http://www.tutorialspoint.com/html/yourfile.mid" width="100%" height="60" >
+     <noembed><img src="http://www.tutorialspoint.com/yourimage.gif" alt="Alternative Media" ></noembed>
+   </embed>
+ 
+   <embed src="http://www.tutorialspoint.com/html/yourfile.mid" width="100%" height="60" >
+     <noembed><img src="http://www.tutorialspoint.com/yourimage.gif" alt="Alternative Media" ></noembed>
+ 
+     <iframe name="test_iframe" src="http://www.tutorialspoint.com/html/menu.htm" width="555" height="200">
+       Sorry your browser does not support inline frames.
+     </iframe>
+     <iframe name="test1_iframe" src="http://www.tutorialspoint.com/html/menu.htm" width="555" height="200">
+       Sorry your browser does not support inline frames.
+     </iframe>
+   </embed>
+ 
+   <video width="320" height="240" controls>
+     <source src="http://www.w3schools.com/movie.ogg" type="video/ogg">
+     Your browser does not support the video tag.
+   </video>
+   <video width="320" height="240" controls>
+     <source src="http://www.w3schools.com/movie.mp4" type="video/mp4">
+     Your browser does not support the video tag.
+   </video>
+   <audio controls>
+     <source src="http://www.w3schools.com/horse.mp3" type="audio/mpeg">
+     Your browser does not support the audio element.
+   </audio>
+   <audio controls>
+     <source src="http://www.w3schools.com/horse.ogg" type="audio/ogg">
+     Your browser does not support the audio element.
+   </audio>
+ 
+   <object width="420" height="315"
+     data="http://www.youtube.com/v/XGSy3_Czz8k">
+   </object>
+ 
+   <object width="420" height="315"
+     data="http://www.youtube.com/v/XGSy3_Czz9k">
+   </object>
+ 
+   <form action="demo_form.asp" method="get">
+ 
+     <input list="browsers" name="browser">
+     <datalist id="browsers">
+       <option value="Internet Explorer">
+       <option value="Firefox">
+       <option value="Chrome">
+       <option value="Opera">
+       <option value="Safari">
+     </datalist>
+     <input type="submit">
+   </form>
+ 
+   <svg width="300" height="200">
+     <polygon points="100,10 40,198 190,78 10,78 160,198"
+       style="fill:lime;stroke:purple;stroke-width:5;fill-rule:evenodd;" />
+     Sorry, your browser does not support inline SVG.
+   </svg>
+ 
+   <svg width="300" height="200">
+     <polygon points="100,10 40,198 190,90 10,78 160,198"
+       style="fill:lime;stroke:purple;stroke-width:5;fill-rule:evenodd;" />
+     Sorry, your browser does not support inline SVG.
+   </svg>
+ 
+ </body>
+ </html>
data/spec/addition_spec.rb ADDED
@@ -0,0 +1,5 @@
+ RSpec.describe "Addition" do
+   it "works" do
+     expect(1 + 1).to eq(2)
+   end
+ end
data/spec/content_crawler_spec.rb ADDED
@@ -0,0 +1,85 @@
+ require 'content_crawler'
+ 
+ RSpec.describe ContentCrawler::Crawler do
+ 
+   it "initializes the crawler" do
+     content_crawler = ContentCrawler::Crawler.new("mechanize_parser", "file://#{Dir.pwd}/public/html_test.html", {:user_agent => "Mac Safari"})
+   end
+ 
+   it "returns a help message when crawling without a crawl_url" do
+     content_crawler = ContentCrawler::Crawler.new("mechanize_parser", "file://#{Dir.pwd}/public/html_test.html", {:user_agent => "Mac Safari"})
+     expect(content_crawler.get_parser_page).to eq("Please select any one of the parser(selenium_webdriver_with_headless, selenium_webdriver_without_headless, mechanize_parser) and pass the crawl_url to crawl content")
+   end
+ 
+   it "returns a Nokogiri::HTML::Document as the parser page" do
+     content_crawler = ContentCrawler::Crawler.new("mechanize_parser", "file://#{Dir.pwd}/public/html_test.html", {:user_agent => "Mac Safari"})
+     expect(content_crawler.get_parser_page("file://#{Dir.pwd}/public/html_test.html").class.name).to eq("Nokogiri::HTML::Document")
+   end
+ 
+   it "gets the select tag texts and values" do
+     content_crawler = ContentCrawler::Crawler.new("mechanize_parser", "file://#{Dir.pwd}/public/html_test.html", {:user_agent => "Mac Safari"})
+     content_crawler.get_parser_page("file://#{Dir.pwd}/public/html_test.html")
+     expect(content_crawler.get_select_elements("//select/option")).to eq([{:text=>"Bestelling geannuleerd", :value=>"23"}, {:text=>"Bestelling ontvangen", :value=>"17"}, {:text=>"Bestelling verzonden", :value=>"24"}, {:text=>"Betaling mislukt", :value=>"22"}, {:text=>"Betaling ontvangen via Bank", :value=>"20"}, {:text=>"Betaling ontvangen via PayPal", :value=>"19"}, {:text=>"Betaling via Bank mislukt", :value=>"21"}, {:text=>"Betaling via PayPal mislukt", :value=>"18"}, {:text=>"Gereed voor afhalen (Delft)", :value=>"25"}, {:text=>"Wachten op betaling", :value=>"26"}])
+     expect(content_crawler.get_select_elements("//select/option", {:format=>"texts_values"})).to eq([{:text=>"Bestelling geannuleerd", :value=>"23"}, {:text=>"Bestelling ontvangen", :value=>"17"}, {:text=>"Bestelling verzonden", :value=>"24"}, {:text=>"Betaling mislukt", :value=>"22"}, {:text=>"Betaling ontvangen via Bank", :value=>"20"}, {:text=>"Betaling ontvangen via PayPal", :value=>"19"}, {:text=>"Betaling via Bank mislukt", :value=>"21"}, {:text=>"Betaling via PayPal mislukt", :value=>"18"}, {:text=>"Gereed voor afhalen (Delft)", :value=>"25"}, {:text=>"Wachten op betaling", :value=>"26"}])
+     expect(content_crawler.get_select_elements("//select/option", {:format=>"only_texts"})).to eq(["Bestelling geannuleerd", "Bestelling ontvangen", "Bestelling verzonden", "Betaling mislukt", "Betaling ontvangen via Bank", "Betaling ontvangen via PayPal", "Betaling via Bank mislukt", "Betaling via PayPal mislukt", "Gereed voor afhalen (Delft)", "Wachten op betaling"])
+     expect(content_crawler.get_select_elements("//select/option", {:format=>"only_values"})).to eq(["23", "17", "24", "22", "20", "19", "21", "18", "25", "26"])
+   end
+ 
+   it "gets the anchor tag texts and hrefs" do
+     content_crawler = ContentCrawler::Crawler.new("mechanize_parser", "file://#{Dir.pwd}/public/html_test.html", {:user_agent => "Mac Safari"})
+     content_crawler.get_parser_page("file://#{Dir.pwd}/public/html_test.html")
+     expect(content_crawler.get_link_elements("//a")).to eq([{:href=>"http://www.test.test", :text=>"Opmerkingen"}, {:href=>" ", :text=>"Geschiedenis bijwerken"}])
+     expect(content_crawler.get_link_elements("//a", {:format=>"texts_hrefs"})).to eq([{:href=>"http://www.test.test", :text=>"Opmerkingen"}, {:href=>" ", :text=>"Geschiedenis bijwerken"}])
+     expect(content_crawler.get_link_elements("//a", {:format=>"only_texts"})).to eq(["Opmerkingen", "Geschiedenis bijwerken"])
+     expect(content_crawler.get_link_elements("//a", {:format=>"only_hrefs"})).to eq(["http://www.test.test", " "])
+   end
+ 
+   it "gets the iframe srcs" do
+     content_crawler = ContentCrawler::Crawler.new("mechanize_parser", "file://#{Dir.pwd}/public/html_test.html", {:user_agent => "Mac Safari"})
+     content_crawler.get_parser_page("file://#{Dir.pwd}/public/html_test.html")
+     expect(content_crawler.get_iframe_embed_elements("//iframe/@src", {:format=>"only_srcs"})).to eq(["http://www.tutorialspoint.com/html/menu.htm"])
+     expect(content_crawler.get_iframe_embed_elements("//iframe/@src")).to eq([{:src => "http://www.tutorialspoint.com/html/menu.htm"}])
+   end
+ 
+   it "stores remote images on the local system" do
+     content_crawler = ContentCrawler::Crawler.new("mechanize_parser", "file://#{Dir.pwd}/public/html_test.html", {:user_agent => "Mac Safari"})
+     content_crawler.get_parser_page("file://#{Dir.pwd}/public/html_test.html")
+     expect(content_crawler.get_remote_image("//img/@src")).to eq(["#{Dir.home}/crawled_images/2462582861_31d51f157c_b.jpg"])
+     expect(content_crawler.get_remote_image("//img/@src", "#{Dir.home}/Desktop/crawled_images")).to eq(["#{Dir.home}/Desktop/crawled_images/2462582861_31d51f157c_b.jpg"])
+   end
+ 
+   it "gets video source urls and types" do
+     content_crawler = ContentCrawler::Crawler.new("mechanize_parser", "file://#{Dir.pwd}/public/html_test.html", {:user_agent => "Mac Safari"})
+     content_crawler.get_parser_page("file://#{Dir.pwd}/public/html_test.html")
+     expect(content_crawler.get_audio_video_elements("//video/source")).to eq([{:src=>"http://www.w3schools.com/movie.ogg", :type=>"video/ogg"}, {:src=>"http://www.w3schools.com/movie.mp4", :type=>"video/mp4"}])
+     expect(content_crawler.get_audio_video_elements("//video/source", {:format=>"srcs_types"})).to eq([{:src=>"http://www.w3schools.com/movie.ogg", :type=>"video/ogg"}, {:src=>"http://www.w3schools.com/movie.mp4", :type=>"video/mp4"}])
+     expect(content_crawler.get_audio_video_elements("//video/source", {:format=>"only_srcs"})).to eq(["http://www.w3schools.com/movie.ogg", "http://www.w3schools.com/movie.mp4"])
+     expect(content_crawler.get_audio_video_elements("//video/source", {:format=>"only_types"})).to eq(["video/ogg", "video/mp4"])
+   end
+ 
+   it "gets audio source urls and types" do
+     content_crawler = ContentCrawler::Crawler.new("mechanize_parser", "file://#{Dir.pwd}/public/html_test.html", {:user_agent => "Mac Safari"})
+     content_crawler.get_parser_page("file://#{Dir.pwd}/public/html_test.html")
+     expect(content_crawler.get_audio_video_elements("//audio/source")).to eq([{:src=>"http://www.w3schools.com/horse.mp3", :type=>"audio/mpeg"}, {:src=>"http://www.w3schools.com/horse.ogg", :type=>"audio/ogg"}])
+     expect(content_crawler.get_audio_video_elements("//audio/source", {:format=>"srcs_types"})).to eq([{:src=>"http://www.w3schools.com/horse.mp3", :type=>"audio/mpeg"}, {:src=>"http://www.w3schools.com/horse.ogg", :type=>"audio/ogg"}])
+     expect(content_crawler.get_audio_video_elements("//audio/source", {:format=>"only_srcs"})).to eq(["http://www.w3schools.com/horse.mp3", "http://www.w3schools.com/horse.ogg"])
+     expect(content_crawler.get_audio_video_elements("//audio/source", {:format=>"only_types"})).to eq(["audio/mpeg", "audio/ogg"])
+   end
+ 
+   it "gets object data urls" do
+     content_crawler = ContentCrawler::Crawler.new("mechanize_parser", "file://#{Dir.pwd}/public/html_test.html", {:user_agent => "Mac Safari"})
+     content_crawler.get_parser_page("file://#{Dir.pwd}/public/html_test.html")
+     expect(content_crawler.get_object_elements("//object/@data")).to eq([{:text=>"http://www.youtube.com/v/XGSy3_Czz8k", :value=>"http://www.youtube.com/v/XGSy3_Czz8k"}, {:text=>"http://www.youtube.com/v/XGSy3_Czz9k", :value=>"http://www.youtube.com/v/XGSy3_Czz9k"}])
+     expect(content_crawler.get_object_elements("//object/@data", {:format=>"texts_values"})).to eq([{:text=>"http://www.youtube.com/v/XGSy3_Czz8k", :value=>"http://www.youtube.com/v/XGSy3_Czz8k"}, {:text=>"http://www.youtube.com/v/XGSy3_Czz9k", :value=>"http://www.youtube.com/v/XGSy3_Czz9k"}])
+     expect(content_crawler.get_object_elements("//object/@data", {:format=>"only_texts"})).to eq(["http://www.youtube.com/v/XGSy3_Czz8k", "http://www.youtube.com/v/XGSy3_Czz9k"])
+     expect(content_crawler.get_object_elements("//object/@data", {:format=>"only_values"})).to eq(["http://www.youtube.com/v/XGSy3_Czz8k", "http://www.youtube.com/v/XGSy3_Czz9k"])
+   end
+ 
+   it "gets datalist values" do
+     content_crawler = ContentCrawler::Crawler.new("mechanize_parser", "file://#{Dir.pwd}/public/html_test.html", {:user_agent => "Mac Safari"})
+     content_crawler.get_parser_page("file://#{Dir.pwd}/public/html_test.html")
+     expect(content_crawler.get_datalist_elements("//datalist/option")).to eq([{:value=>"Internet Explorer"}, {:value=>"Firefox"}, {:value=>"Chrome"}, {:value=>"Opera"}, {:value=>"Safari"}])
+     expect(content_crawler.get_datalist_elements("//datalist/option", {:format=>"only_values"})).to eq(["Internet Explorer", "Firefox", "Chrome", "Opera", "Safari"])
+   end
+ 
+ end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,89 @@
+ # This file was generated by the `rspec --init` command. Conventionally, all
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
+ # The generated `.rspec` file contains `--require spec_helper` which will cause this
+ # file to always be loaded, without a need to explicitly require it in any files.
+ #
+ # Given that it is always loaded, you are encouraged to keep this file as
+ # light-weight as possible. Requiring heavyweight dependencies from this file
+ # will add to the boot time of your test suite on EVERY test run, even for an
+ # individual file that may not need all of that loaded. Instead, consider making
+ # a separate helper file that requires the additional dependencies and performs
+ # the additional setup, and require it from the spec files that actually need it.
+ #
+ # The `.rspec` file also contains a few flags that are not defaults but that
+ # users commonly want.
+ #
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+ RSpec.configure do |config|
+   # rspec-expectations config goes here. You can use an alternate
+   # assertion/expectation library such as wrong or the stdlib/minitest
+   # assertions if you prefer.
+   config.expect_with :rspec do |expectations|
+     # This option will default to `true` in RSpec 4. It makes the `description`
+     # and `failure_message` of custom matchers include text for helper methods
+     # defined using `chain`, e.g.:
+     #     be_bigger_than(2).and_smaller_than(4).description
+     #     # => "be bigger than 2 and smaller than 4"
+     # ...rather than:
+     #     # => "be bigger than 2"
+     expectations.include_chain_clauses_in_custom_matcher_descriptions = true
+   end
+ 
+   # rspec-mocks config goes here. You can use an alternate test double
+   # library (such as bogus or mocha) by changing the `mock_with` option here.
+   config.mock_with :rspec do |mocks|
+     # Prevents you from mocking or stubbing a method that does not exist on
+     # a real object. This is generally recommended, and will default to
+     # `true` in RSpec 4.
+     mocks.verify_partial_doubles = true
+   end
+ 
+   # The settings below are suggested to provide a good initial experience
+   # with RSpec, but feel free to customize to your heart's content.
+ =begin
+   # These two settings work together to allow you to limit a spec run
+   # to individual examples or groups you care about by tagging them with
+   # `:focus` metadata. When nothing is tagged with `:focus`, all examples
+   # get run.
+   config.filter_run :focus
+   config.run_all_when_everything_filtered = true
+ 
+   # Limits the available syntax to the non-monkey patched syntax that is recommended.
+   # For more details, see:
+   #   - http://myronmars.to/n/dev-blog/2012/06/rspecs-new-expectation-syntax
+   #   - http://teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
+   #   - http://myronmars.to/n/dev-blog/2014/05/notable-changes-in-rspec-3#new__config_option_to_disable_rspeccore_monkey_patching
+   config.disable_monkey_patching!
+ 
+   # This setting enables warnings. It's recommended, but in some cases may
+   # be too noisy due to issues in dependencies.
+   config.warnings = true
+ 
+   # Many RSpec users commonly either run the entire suite or an individual
+   # file, and it's useful to allow more verbose output when running an
+   # individual spec file.
+   if config.files_to_run.one?
+     # Use the documentation formatter for detailed output,
+     # unless a formatter has already been configured
+     # (e.g. via a command-line flag).
+     config.default_formatter = 'doc'
+   end
+ 
+   # Print the 10 slowest examples and example groups at the
+   # end of the spec run, to help surface which specs are running
+   # particularly slow.
+   config.profile_examples = 10
+ 
+   # Run specs in random order to surface order dependencies. If you find an
+   # order dependency and want to debug it, you can fix the order by providing
+   # the seed, which is printed after each run.
+   #     --seed 1234
+   config.order = :random
+ 
+   # Seed global randomization in this process using the `--seed` CLI option.
+   # Setting this allows you to use `--seed` to deterministically reproduce
+   # test failures related to randomization by passing the same `--seed` value
+   # as the one that triggered the failure.
+   Kernel.srand config.seed
+ =end
+ end
metadata ADDED
@@ -0,0 +1,104 @@
+ --- !ruby/object:Gem::Specification
+ name: content_crawler
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Prakash Natarajan
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-12-23 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.7'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.7'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Crawls content from websites using user-supplied XPath expressions.
+   New functionality will be added in future releases.
+ email:
+ - prakashntrjn@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".rspec"
+ - Gemfile
+ - Gemfile.lock
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - content_crawler.gemspec
+ - lib/content_crawler.rb
+ - lib/content_crawler/crawler_process.rb
+ - lib/content_crawler/version.rb
+ - public/html_test.html
+ - spec/addition_spec.rb
+ - spec/content_crawler_spec.rb
+ - spec/spec_helper.rb
+ homepage: ''
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.2.2
+ signing_key:
+ specification_version: 4
+ summary: Content crawler
+ test_files:
+ - spec/addition_spec.rb
+ - spec/content_crawler_spec.rb
+ - spec/spec_helper.rb