content_crawler 0.0.1

checksums.yaml.gz ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: e07d1889de277c5d40e886101e3de4f0a36aa618
+   data.tar.gz: cc61a64b1634784495b7ad8c28a7510e63e17589
+ SHA512:
+   metadata.gz: fb737889a42dba37e550d232ddd074c295afa3ea8c0b6d95015d388053c5b73ee8791f3ee9646b76ef45300dac8d765888568b385a964546a2890f184c0310c0
+   data.tar.gz: 32fc8c939a11f70ae3ee1ed502a919c99dbf121485aa0fa9704598ea13b18223caed21d86603adde5d4bcfce68ca272634e4d78438aa4de453b037bc3c59b13d
data/.rspec ADDED
@@ -0,0 +1,2 @@
+ --color
+ --require spec_helper
data/Gemfile ADDED
@@ -0,0 +1,13 @@
+ source 'https://rubygems.org'
+ 
+ # Specify your gem's dependencies in content_crawler.gemspec
+ 
+ # get browsers
+ gem 'browser'
+ gem 'watir-webdriver'
+ gem 'headless'
+ gem 'nokogiri'
+ gem 'mechanize'
+ gem 'rspec'
+ 
+ # gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,66 @@
+ GEM
+   remote: https://rubygems.org/
+   specs:
+     browser (0.6.0)
+     childprocess (0.5.3)
+       ffi (~> 1.0, >= 1.0.11)
+     diff-lcs (1.2.5)
+     domain_name (0.5.21)
+       unf (>= 0.0.5, < 1.0.0)
+     ffi (1.9.3)
+     headless (1.0.2)
+     http-cookie (1.0.2)
+       domain_name (~> 0.5)
+     mechanize (2.7.3)
+       domain_name (~> 0.5, >= 0.5.1)
+       http-cookie (~> 1.0)
+       mime-types (~> 2.0)
+       net-http-digest_auth (~> 1.1, >= 1.1.1)
+       net-http-persistent (~> 2.5, >= 2.5.2)
+       nokogiri (~> 1.4)
+       ntlm-http (~> 0.1, >= 0.1.1)
+       webrobots (>= 0.0.9, < 0.2)
+     mime-types (2.3)
+     mini_portile (0.6.0)
+     multi_json (1.10.1)
+     net-http-digest_auth (1.4)
+     net-http-persistent (2.9.4)
+     nokogiri (1.6.3.1)
+       mini_portile (= 0.6.0)
+     ntlm-http (0.1.1)
+     rspec (3.1.0)
+       rspec-core (~> 3.1.0)
+       rspec-expectations (~> 3.1.0)
+       rspec-mocks (~> 3.1.0)
+     rspec-core (3.1.2)
+       rspec-support (~> 3.1.0)
+     rspec-expectations (3.1.0)
+       diff-lcs (>= 1.2.0, < 2.0)
+       rspec-support (~> 3.1.0)
+     rspec-mocks (3.1.0)
+       rspec-support (~> 3.1.0)
+     rspec-support (3.1.0)
+     rubyzip (1.1.6)
+     selenium-webdriver (2.43.0)
+       childprocess (~> 0.5)
+       multi_json (~> 1.0)
+       rubyzip (~> 1.0)
+       websocket (~> 1.0)
+     unf (0.1.4)
+       unf_ext
+     unf_ext (0.0.6)
+     watir-webdriver (0.6.10)
+       selenium-webdriver (>= 2.18.0)
+     webrobots (0.1.1)
+     websocket (1.2.1)
+ 
+ PLATFORMS
+   ruby
+ 
+ DEPENDENCIES
+   browser
+   headless
+   mechanize
+   nokogiri
+   rspec
+   watir-webdriver
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2014 Prakash Natarajan
+ 
+ MIT License
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,31 @@
+ # ContentCrawler
+ 
+ TODO: Write a gem description
+ 
+ ## Installation
+ 
+ Add this line to your application's Gemfile:
+ 
+ ```ruby
+ gem 'content_crawler'
+ ```
+ 
+ And then execute:
+ 
+     $ bundle
+ 
+ Or install it yourself as:
+ 
+     $ gem install content_crawler
+ 
+ ## Usage
+ 
+ TODO: Write usage instructions here
+ 
+ ## Contributing
+ 
+ 1. Fork it ( https://github.com/[my-github-username]/content_crawler/fork )
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
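
The README's Usage section is still a TODO. Going by the spec suite further down, a minimal sketch of intended usage (assuming the "mechanize_parser" backend; the URL is a placeholder) looks like this:

```ruby
require 'content_crawler'

# Minimal sketch, mirroring the calls exercised in spec/content_crawler_spec.rb.
crawler = ContentCrawler::Crawler.new("mechanize_parser",
                                      "http://example.com",
                                      :user_agent => "Mac Safari")
crawler.get_parser_page("http://example.com/some_page")

# Extract anchor hrefs via XPath.
puts crawler.get_link_elements("//a", :format => "only_hrefs")
```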
data/Rakefile ADDED
@@ -0,0 +1,2 @@
+ require "bundler/gem_tasks"
+ 
data/content_crawler.gemspec ADDED
@@ -0,0 +1,24 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'content_crawler/version'
+ 
+ Gem::Specification.new do |spec|
+   spec.name          = "content_crawler"
+   spec.version       = ContentCrawler::VERSION
+   spec.authors       = ["Prakash Natarajan"]
+   spec.email         = ["prakashntrjn@gmail.com"]
+   spec.summary       = %q{Content crawler}
+   spec.description   = %q{Crawls content from websites using user-supplied XPath expressions. New functionality will be added in future releases.}
+   spec.homepage      = ""
+   spec.license       = "MIT"
+ 
+   spec.files         = `git ls-files -z`.split("\x0")
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+ 
+   spec.add_development_dependency "bundler", "~> 1.7"
+   spec.add_development_dependency "rake", "~> 10.0"
+   spec.add_development_dependency "rspec"
+ end
data/lib/content_crawler.rb ADDED
@@ -0,0 +1,65 @@
+ require "content_crawler/version"
+ require "content_crawler/crawler_process"
+ 
+ =begin
+ module ContentCrawler
+   include CrawlerProcess
+   # Your code goes here...
+ end
+ =end
+ 
+ module ContentCrawler
+   class Crawler
+     include CrawlerProcess
+ 
+     def initialize(crawler, base_url, options = {:timeout => 300, :user_agent => nil})
+       super
+     end
+ 
+     def get_parser_page(crawl_url = nil)
+       if (not @browser.nil? and not crawl_url.nil?)
+         @browser.goto(crawl_url)
+         @page = Nokogiri::HTML(@browser.html)
+       elsif (not @agent.nil? and not crawl_url.nil?)
+         @page = @agent.get(crawl_url).parser
+       else
+         "Please select any one of the parser(selenium_webdriver_with_headless, selenium_webdriver_without_headless, mechanize_parser) and pass the crawl_url to crawl content"
+       end
+     end
+ 
+     def get_simple_text(xpath = nil)
+       @page.xpath(xpath).text.strip if not xpath.nil?
+     end
+ 
+     def get_link_elements(xpath = nil, options = {})
+       collection_links(@page.xpath(xpath), options) if not xpath.nil?
+     end
+ 
+     def get_remote_image(xpath = nil, image_store_dir = nil)
+       store_remote_image(@page.xpath(xpath), image_store_dir) if not xpath.nil?
+     end
+ 
+     def get_select_elements(xpath = nil, options = {})
+       select_collection(@page.xpath(xpath), options) if not xpath.nil?
+     end
+ 
+     def get_iframe_embed_elements(xpath = nil, options = {})
+       iframe_embed_collection(@page.xpath(xpath), options) if not xpath.nil?
+     end
+ 
+     def get_audio_video_elements(xpath = nil, options = {})
+       audio_video_collection(@page.xpath(xpath), options) if not xpath.nil?
+     end
+ 
+     def get_object_elements(xpath = nil, options = {})
+       object_collection(@page.xpath(xpath), options) if not xpath.nil?
+     end
+ 
+     def get_datalist_elements(xpath = nil, options = {})
+       datalist_collection(@page.xpath(xpath), options) if not xpath.nil?
+     end
+ 
+     def close_browser
+       super
+     end
+   end
+ end
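
`Crawler` is a thin layer of XPath-oriented getters over `CrawlerProcess`; the backend is chosen entirely by the first constructor argument. A sketch of the browser-backed path (assuming Firefox and Xvfb are installed locally; the URL is a placeholder):

```ruby
require 'content_crawler'

# "selenium_webdriver_with_headless" starts an Xvfb display and a Firefox
# instance, so this sketch only runs where both are available.
crawler = ContentCrawler::Crawler.new("selenium_webdriver_with_headless",
                                      "http://example.com",
                                      :timeout => 300)
begin
  crawler.get_parser_page("http://example.com/videos")
  puts crawler.get_audio_video_elements("//video/source", :format => "only_srcs")
ensure
  # Shuts down the browser and destroys the headless display.
  crawler.close_browser
end
```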
data/lib/content_crawler/crawler_process.rb ADDED
@@ -0,0 +1,169 @@
+ require 'fileutils'
+ require 'net/https'
+ require 'uri'
+ require 'nokogiri'
+ require 'headless'
+ require 'watir-webdriver'
+ require 'mechanize'
+ 
+ module CrawlerProcess
+   # Initialize the crawler process
+   def initialize(crawler, base_url, options = {:timeout => 300, :user_agent => nil})
+     @base_url = base_url
+     case crawler
+     when "selenium_webdriver_with_headless"
+       @headless = Headless.new
+       @headless.start
+       watir_web_browser(options[:timeout])
+     when "selenium_webdriver_without_headless"
+       watir_web_browser(options[:timeout])
+     when "mechanize_parser"
+       mechanize_parser(options[:user_agent])
+     else
+       puts "Please select any one of the parser(selenium_webdriver_with_headless, selenium_webdriver_without_headless, mechanize_parser) to crawl content"
+     end
+   end
+ 
+   # Watir web driver, which opens a browser
+   def watir_web_browser(timeout)
+     client = Selenium::WebDriver::Remote::Http::Default.new
+     client.timeout = timeout
+     @browser = Watir::Browser.new :firefox, :http_client => client
+     @browser.goto(@base_url)
+     @browser
+   end
+ 
+   # Mechanize parser
+   def mechanize_parser(user_agent = nil)
+     if user_agent.nil?
+       @agent = Mechanize.new { |a| a.ssl_version, a.verify_mode = 'SSLv3', OpenSSL::SSL::VERIFY_NONE }
+     else
+       @agent = Mechanize.new { |agent| agent.user_agent_alias = user_agent }
+     end
+     #@page = @agent.get(@base_url).parser
+     @agent
+   end
+ 
+   # To get the anchor tag details
+   def collection_links(parser_links, options = {})
+     links = Array.new
+     parser_links = [parser_links].flatten.uniq
+     parser_links.each do |link|
+       data = {}
+       data[:href] = link.attributes["href"].nil? ? " " : link.attributes["href"].value.strip
+       data[:text] = link.text.nil? ? " " : link.text.strip
+       links << data
+     end
+     collection_attr(links, options)
+   end
+ 
+   # To download remote images into a local directory
+   def store_remote_image(image_detail, image_store_dir)
+     image_store_dir = check_local_dir(image_store_dir)
+     remote_image_urls = iframe_embed_collection(image_detail, {:format => "only_srcs"})
+     local_images = []
+     remote_image_urls.each do |image_url|
+       image_url = "#{@base_url}#{image_url}" if not image_url.include?("http")
+       url = URI.parse(image_url)
+       response = Net::HTTP.get_response(url)
+       if response.is_a?(Net::HTTPSuccess)
+         http = Net::HTTP.new(url.host, url.port)
+         http.use_ssl = true if url.scheme == "https"
+         http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+         http.start do
+           http.request_get(url.path) do |res|
+             File.open("#{image_store_dir}/#{File.basename(url.path)}", 'wb') do |file|
+               file.write(res.body)
+             end
+           end
+         end
+         local_image = "#{image_store_dir}/#{File.basename(url.path)}"
+         local_images << local_image
+       end
+     end
+     local_images
+   end
+ 
+   # Ensure the local image directory exists
+   def check_local_dir(image_store_dir)
+     image_store_dir = "#{Dir.home}/crawled_images" if image_store_dir.nil?
+     if not Dir.exist?("#{image_store_dir}")
+       Dir.mkdir("#{image_store_dir}")
+     end
+     image_store_dir
+   end
+ 
+   # To get select tag details
+   def select_collection(select_detail, options = {})
+     selects = []
+     select_detail.each do |select|
+       hash = {}
+       hash[:text] = select.text.strip
+       hash[:value] = select.attributes["value"].text.strip
+       selects << hash
+     end
+     collection_attr(selects, options)
+   end
+ 
+   # To get iframe and embed srcs
+   def iframe_embed_collection(ifrm_embd_detail, options = {})
+     ifrm_embds = []
+     ifrm_embd_detail.each do |ifrmembd|
+       hash = {}
+       hash[:src] = ifrmembd.value.strip
+       ifrm_embds << hash
+     end
+     collection_attr(ifrm_embds, options)
+   end
+ 
+   # To get audio and video details
+   def audio_video_collection(audio_video_detail, options = {})
+     auvid_collection = []
+     audio_video_detail.each do |auvid|
+       hash = {}
+       hash[:src] = auvid.attributes["src"].value.strip
+       hash[:type] = auvid.attributes["type"].value.strip
+       auvid_collection << hash
+     end
+     collection_attr(auvid_collection, options)
+   end
+ 
+   # To get object details
+   def object_collection(object_detail, options = {})
+     objects = []
+     object_detail.each do |object|
+       hash = {}
+       hash[:text] = object.text.strip
+       hash[:value] = object.value.strip
+       objects << hash
+     end
+     collection_attr(objects, options)
+   end
+ 
+   # To get datalist values
+   def datalist_collection(datalist_detail, options = {})
+     datalists = []
+     datalist_detail.each do |datalist|
+       hash = {}
+       hash[:value] = datalist.attributes["value"].value.strip
+       datalists << hash
+     end
+     collection_attr(datalists, options)
+   end
+ 
+   # To pick a particular attribute out of the collected hashes
+   def collection_attr(collection, options)
+     collection = [collection].flatten.compact.uniq
+     case options[:format]
+     when "srcs_types", "texts_values", "texts_srcs", "texts_hrefs"
+       collection
+     when "only_srcs"
+       collection.map { |collobjt| collobjt[:src] }.compact
+     when "only_types"
+       collection.map { |collobjt| collobjt[:type] }.compact
+     when "only_values"
+       collection.map { |collobjt| collobjt[:value] }.compact
+     when "only_texts"
+       collection.map { |collobjt| collobjt[:text] }.compact
+     when "only_hrefs"
+       collection.map { |collobjt| collobjt[:href] }.compact
+     else
+       collection
+     end
+   end
+ 
+   # Close the browser and tear down the headless display
+   def close_browser
+     @browser.close if not @browser.nil?
+     @headless.destroy if not @headless.nil?
+   end
+ end
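
Every getter funnels through `collection_attr`, so the `:format` option decides whether callers get back an array of hashes or a flat array of a single attribute. Reusing the crawler sketched above, with return values taken from the spec suite run against the fixture page below:

```ruby
# Default (no :format): hashes with every collected attribute.
crawler.get_link_elements("//a")
# => [{:href=>"http://www.test.test", :text=>"Opmerkingen"},
#     {:href=>" ", :text=>"Geschiedenis bijwerken"}]

# "only_*" formats flatten to one attribute per element.
crawler.get_link_elements("//a", :format => "only_hrefs")
# => ["http://www.test.test", " "]
```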
data/lib/content_crawler/version.rb ADDED
@@ -0,0 +1,3 @@
+ module ContentCrawler
+   VERSION = "0.0.1"
+ end
data/public/html_test.html ADDED
@@ -0,0 +1,140 @@
+ <html>
+ <body>
+   <div class="vtabs-content" id="tab-history" style="display: block;">
+     <div id="history">
+       <table class="form">
+         <tbody>
+           <tr>
+             <td>Bestelstatus:</td>
+ 
+             <td><select name="order_status_id">
+               <option value="23">Bestelling geannuleerd</option>
+ 
+               <option value="17">Bestelling ontvangen</option>
+ 
+               <option selected="selected" value="24">Bestelling verzonden</option>
+ 
+               <option value="22">Betaling mislukt</option>
+ 
+               <option value="20">Betaling ontvangen via Bank</option>
+ 
+               <option value="19">Betaling ontvangen via PayPal</option>
+ 
+               <option value="21">Betaling via Bank mislukt</option>
+ 
+               <option value="18">Betaling via PayPal mislukt</option>
+ 
+               <option value="25">Gereed voor afhalen (Delft)</option>
+ 
+               <option value="26">Wachten op betaling</option>
+             </select></td>
+           </tr>
+ 
+           <tr>
+             <td>Mail naar klant:<span class="help">Wel zichtbaar voor klant bij
+             de bestelinfo.</span></td>
+ 
+             <td><input name="notify" type="checkbox" value="1"></td>
+           </tr>
+ 
+           <tr>
+             <td><a href="http://www.test.test">Opmerkingen</a></td>
+ 
+             <td>
+               <textarea cols="40" name="comment" rows="8" style="width: 99%"></textarea>
+ 
+               <div style="margin-top: 10px; text-align: right;">
+                 <a class="button" id="button-history" name=
+                 "button-history">Geschiedenis bijwerken</a>
+               </div>
+             </td>
+           </tr>
+ 
+           <tr>
+             <td><img src="https://farm4.staticflickr.com/3147/2462582861_31d51f157c_b.jpg"></td>
+           </tr>
+ 
+         </tbody>
+       </table>
+     </div>
+   </div>
+ 
+ 
+   <frameset rows="10%,80%,10%">
+     <frame name="top" src="http://www.tutorialspoint.com/html/top_frame.htm" />
+     <frame name="main" src="http://www.tutorialspoint.com/html/main_frame.htm" />
+     <frame name="bottom" src="http://www.tutorialspoint.com/html/bottom_frame.htm" />
+     <noframes>
+       <body>
+         Your browser does not support frames.
+       </body>
+     </noframes>
+   </frameset>
+ 
+   <embed src="http://www.tutorialspoint.com/html/yourfile.mid" width="100%" height="60" >
+     <noembed><img src="http://www.tutorialspoint.com/yourimage.gif" alt="Alternative Media" ></noembed>
+   </embed>
+ 
+   <embed src="http://www.tutorialspoint.com/html/yourfile.mid" width="100%" height="60" >
+     <noembed><img src="http://www.tutorialspoint.com/yourimage.gif" alt="Alternative Media" ></noembed>
+ 
+     <iframe name="test_iframe" src="http://www.tutorialspoint.com/html/menu.htm" width="555" height="200">
+       Sorry your browser does not support inline frames.
+     </iframe>
+     <iframe name="test1_iframe" src="http://www.tutorialspoint.com/html/menu.htm" width="555" height="200">
+       Sorry your browser does not support inline frames.
+     </iframe>
+   </embed>
+ 
+   <video width="320" height="240" controls>
+     <source src="http://www.w3schools.com/movie.ogg" type="video/ogg">
+     Your browser does not support the video tag.
+   </video>
+   <video width="320" height="240" controls>
+     <source src="http://www.w3schools.com/movie.mp4" type="video/mp4">
+     Your browser does not support the video tag.
+   </video>
+   <audio controls>
+     <source src="http://www.w3schools.com/horse.mp3" type="audio/mpeg">
+     Your browser does not support the audio element.
+   </audio>
+   <audio controls>
+     <source src="http://www.w3schools.com/horse.ogg" type="audio/ogg">
+     Your browser does not support the audio element.
+   </audio>
+ 
+   <object width="420" height="315"
+     data="http://www.youtube.com/v/XGSy3_Czz8k">
+   </object>
+ 
+   <object width="420" height="315"
+     data="http://www.youtube.com/v/XGSy3_Czz9k">
+   </object>
+ 
+   <form action="demo_form.asp" method="get">
+ 
+     <input list="browsers" name="browser">
+     <datalist id="browsers">
+       <option value="Internet Explorer">
+       <option value="Firefox">
+       <option value="Chrome">
+       <option value="Opera">
+       <option value="Safari">
+     </datalist>
+     <input type="submit">
+   </form>
+ 
+   <svg width="300" height="200">
+     <polygon points="100,10 40,198 190,78 10,78 160,198"
+       style="fill:lime;stroke:purple;stroke-width:5;fill-rule:evenodd;" />
+     Sorry, your browser does not support inline SVG.
+   </svg>
+ 
+   <svg width="300" height="200">
+     <polygon points="100,10 40,198 190,90 10,78 160,198"
+       style="fill:lime;stroke:purple;stroke-width:5;fill-rule:evenodd;" />
+     Sorry, your browser does not support inline SVG.
+   </svg>
+ 
+ </body>
+ </html>
data/spec/addition_spec.rb ADDED
@@ -0,0 +1,5 @@
+ RSpec.describe "Addition" do
+   it "works" do
+     expect(1 + 1).to eq(2)
+   end
+ end
data/spec/content_crawler_spec.rb ADDED
@@ -0,0 +1,85 @@
+ require 'content_crawler'
+ 
+ RSpec.describe ContentCrawler::Crawler do
+ 
+   it "initializes the crawler" do
+     content_crawler = ContentCrawler::Crawler.new("mechanize_parser", "file://#{Dir.pwd}/public/html_test.html", {:user_agent => "Mac Safari"})
+   end
+ 
+   it "returns a help message when crawling without a crawl_url" do
+     content_crawler = ContentCrawler::Crawler.new("mechanize_parser", "file://#{Dir.pwd}/public/html_test.html", {:user_agent => "Mac Safari"})
+     expect(content_crawler.get_parser_page).to eq("Please select any one of the parser(selenium_webdriver_with_headless, selenium_webdriver_without_headless, mechanize_parser) and pass the crawl_url to crawl content")
+   end
+ 
+   it "returns a Nokogiri::HTML::Document as the parser page" do
+     content_crawler = ContentCrawler::Crawler.new("mechanize_parser", "file://#{Dir.pwd}/public/html_test.html", {:user_agent => "Mac Safari"})
+     expect(content_crawler.get_parser_page("file://#{Dir.pwd}/public/html_test.html").class.name).to eq("Nokogiri::HTML::Document")
+   end
+ 
+   it "gets the select tag texts and values" do
+     content_crawler = ContentCrawler::Crawler.new("mechanize_parser", "file://#{Dir.pwd}/public/html_test.html", {:user_agent => "Mac Safari"})
+     content_crawler.get_parser_page("file://#{Dir.pwd}/public/html_test.html")
+     expect(content_crawler.get_select_elements("//select/option")).to eq([{:text=>"Bestelling geannuleerd", :value=>"23"}, {:text=>"Bestelling ontvangen", :value=>"17"}, {:text=>"Bestelling verzonden", :value=>"24"}, {:text=>"Betaling mislukt", :value=>"22"}, {:text=>"Betaling ontvangen via Bank", :value=>"20"}, {:text=>"Betaling ontvangen via PayPal", :value=>"19"}, {:text=>"Betaling via Bank mislukt", :value=>"21"}, {:text=>"Betaling via PayPal mislukt", :value=>"18"}, {:text=>"Gereed voor afhalen (Delft)", :value=>"25"}, {:text=>"Wachten op betaling", :value=>"26"}])
+     expect(content_crawler.get_select_elements("//select/option", {:format=>"texts_values"})).to eq([{:text=>"Bestelling geannuleerd", :value=>"23"}, {:text=>"Bestelling ontvangen", :value=>"17"}, {:text=>"Bestelling verzonden", :value=>"24"}, {:text=>"Betaling mislukt", :value=>"22"}, {:text=>"Betaling ontvangen via Bank", :value=>"20"}, {:text=>"Betaling ontvangen via PayPal", :value=>"19"}, {:text=>"Betaling via Bank mislukt", :value=>"21"}, {:text=>"Betaling via PayPal mislukt", :value=>"18"}, {:text=>"Gereed voor afhalen (Delft)", :value=>"25"}, {:text=>"Wachten op betaling", :value=>"26"}])
+     expect(content_crawler.get_select_elements("//select/option", {:format=>"only_texts"})).to eq(["Bestelling geannuleerd", "Bestelling ontvangen", "Bestelling verzonden", "Betaling mislukt", "Betaling ontvangen via Bank", "Betaling ontvangen via PayPal", "Betaling via Bank mislukt", "Betaling via PayPal mislukt", "Gereed voor afhalen (Delft)", "Wachten op betaling"])
+     expect(content_crawler.get_select_elements("//select/option", {:format=>"only_values"})).to eq(["23", "17", "24", "22", "20", "19", "21", "18", "25", "26"])
+   end
+ 
+   it "gets the anchor tag texts and hrefs" do
+     content_crawler = ContentCrawler::Crawler.new("mechanize_parser", "file://#{Dir.pwd}/public/html_test.html", {:user_agent => "Mac Safari"})
+     content_crawler.get_parser_page("file://#{Dir.pwd}/public/html_test.html")
+     expect(content_crawler.get_link_elements("//a")).to eq([{:href=>"http://www.test.test", :text=>"Opmerkingen"}, {:href=>" ", :text=>"Geschiedenis bijwerken"}])
+     expect(content_crawler.get_link_elements("//a", {:format=>"texts_hrefs"})).to eq([{:href=>"http://www.test.test", :text=>"Opmerkingen"}, {:href=>" ", :text=>"Geschiedenis bijwerken"}])
+     expect(content_crawler.get_link_elements("//a", {:format=>"only_texts"})).to eq(["Opmerkingen", "Geschiedenis bijwerken"])
+     expect(content_crawler.get_link_elements("//a", {:format=>"only_hrefs"})).to eq(["http://www.test.test", " "])
+   end
+ 
+   it "gets the iframe srcs" do
+     content_crawler = ContentCrawler::Crawler.new("mechanize_parser", "file://#{Dir.pwd}/public/html_test.html", {:user_agent => "Mac Safari"})
+     content_crawler.get_parser_page("file://#{Dir.pwd}/public/html_test.html")
+     expect(content_crawler.get_iframe_embed_elements("//iframe/@src", {:format=>"only_srcs"})).to eq(["http://www.tutorialspoint.com/html/menu.htm"])
+     expect(content_crawler.get_iframe_embed_elements("//iframe/@src")).to eq([{:src => "http://www.tutorialspoint.com/html/menu.htm"}])
+   end
+ 
+   it "stores remote images on the local system" do
+     content_crawler = ContentCrawler::Crawler.new("mechanize_parser", "file://#{Dir.pwd}/public/html_test.html", {:user_agent => "Mac Safari"})
+     content_crawler.get_parser_page("file://#{Dir.pwd}/public/html_test.html")
+     expect(content_crawler.get_remote_image("//img/@src")).to eq(["#{Dir.home}/crawled_images/2462582861_31d51f157c_b.jpg"])
+     expect(content_crawler.get_remote_image("//img/@src", "#{Dir.home}/Desktop/crawled_images")).to eq(["#{Dir.home}/Desktop/crawled_images/2462582861_31d51f157c_b.jpg"])
+   end
+ 
+   it "gets video source urls and types" do
+     content_crawler = ContentCrawler::Crawler.new("mechanize_parser", "file://#{Dir.pwd}/public/html_test.html", {:user_agent => "Mac Safari"})
+     content_crawler.get_parser_page("file://#{Dir.pwd}/public/html_test.html")
+     expect(content_crawler.get_audio_video_elements("//video/source")).to eq([{:src=>"http://www.w3schools.com/movie.ogg", :type=>"video/ogg"}, {:src=>"http://www.w3schools.com/movie.mp4", :type=>"video/mp4"}])
+     expect(content_crawler.get_audio_video_elements("//video/source", {:format=>"srcs_types"})).to eq([{:src=>"http://www.w3schools.com/movie.ogg", :type=>"video/ogg"}, {:src=>"http://www.w3schools.com/movie.mp4", :type=>"video/mp4"}])
+     expect(content_crawler.get_audio_video_elements("//video/source", {:format=>"only_srcs"})).to eq(["http://www.w3schools.com/movie.ogg", "http://www.w3schools.com/movie.mp4"])
+     expect(content_crawler.get_audio_video_elements("//video/source", {:format=>"only_types"})).to eq(["video/ogg", "video/mp4"])
+   end
+ 
+   it "gets audio source urls and types" do
+     content_crawler = ContentCrawler::Crawler.new("mechanize_parser", "file://#{Dir.pwd}/public/html_test.html", {:user_agent => "Mac Safari"})
+     content_crawler.get_parser_page("file://#{Dir.pwd}/public/html_test.html")
+     expect(content_crawler.get_audio_video_elements("//audio/source")).to eq([{:src=>"http://www.w3schools.com/horse.mp3", :type=>"audio/mpeg"}, {:src=>"http://www.w3schools.com/horse.ogg", :type=>"audio/ogg"}])
+     expect(content_crawler.get_audio_video_elements("//audio/source", {:format=>"srcs_types"})).to eq([{:src=>"http://www.w3schools.com/horse.mp3", :type=>"audio/mpeg"}, {:src=>"http://www.w3schools.com/horse.ogg", :type=>"audio/ogg"}])
+     expect(content_crawler.get_audio_video_elements("//audio/source", {:format=>"only_srcs"})).to eq(["http://www.w3schools.com/horse.mp3", "http://www.w3schools.com/horse.ogg"])
+     expect(content_crawler.get_audio_video_elements("//audio/source", {:format=>"only_types"})).to eq(["audio/mpeg", "audio/ogg"])
+   end
+ 
+   it "gets object data urls" do
+     content_crawler = ContentCrawler::Crawler.new("mechanize_parser", "file://#{Dir.pwd}/public/html_test.html", {:user_agent => "Mac Safari"})
+     content_crawler.get_parser_page("file://#{Dir.pwd}/public/html_test.html")
+     expect(content_crawler.get_object_elements("//object/@data")).to eq([{:text=>"http://www.youtube.com/v/XGSy3_Czz8k", :value=>"http://www.youtube.com/v/XGSy3_Czz8k"}, {:text=>"http://www.youtube.com/v/XGSy3_Czz9k", :value=>"http://www.youtube.com/v/XGSy3_Czz9k"}])
+     expect(content_crawler.get_object_elements("//object/@data", {:format=>"texts_values"})).to eq([{:text=>"http://www.youtube.com/v/XGSy3_Czz8k", :value=>"http://www.youtube.com/v/XGSy3_Czz8k"}, {:text=>"http://www.youtube.com/v/XGSy3_Czz9k", :value=>"http://www.youtube.com/v/XGSy3_Czz9k"}])
+     expect(content_crawler.get_object_elements("//object/@data", {:format=>"only_texts"})).to eq(["http://www.youtube.com/v/XGSy3_Czz8k", "http://www.youtube.com/v/XGSy3_Czz9k"])
+     expect(content_crawler.get_object_elements("//object/@data", {:format=>"only_values"})).to eq(["http://www.youtube.com/v/XGSy3_Czz8k", "http://www.youtube.com/v/XGSy3_Czz9k"])
+   end
+ 
+   it "gets datalist values" do
+     content_crawler = ContentCrawler::Crawler.new("mechanize_parser", "file://#{Dir.pwd}/public/html_test.html", {:user_agent => "Mac Safari"})
+     content_crawler.get_parser_page("file://#{Dir.pwd}/public/html_test.html")
+     expect(content_crawler.get_datalist_elements("//datalist/option")).to eq([{:value=>"Internet Explorer"}, {:value=>"Firefox"}, {:value=>"Chrome"}, {:value=>"Opera"}, {:value=>"Safari"}])
+     expect(content_crawler.get_datalist_elements("//datalist/option", {:format=>"only_values"})).to eq(["Internet Explorer", "Firefox", "Chrome", "Opera", "Safari"])
+   end
+ 
+ end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,89 @@
+ # This file was generated by the `rspec --init` command. Conventionally, all
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
+ # The generated `.rspec` file contains `--require spec_helper` which will cause this
+ # file to always be loaded, without a need to explicitly require it in any files.
+ #
+ # Given that it is always loaded, you are encouraged to keep this file as
+ # light-weight as possible. Requiring heavyweight dependencies from this file
+ # will add to the boot time of your test suite on EVERY test run, even for an
+ # individual file that may not need all of that loaded. Instead, consider making
+ # a separate helper file that requires the additional dependencies and performs
+ # the additional setup, and require it from the spec files that actually need it.
+ #
+ # The `.rspec` file also contains a few flags that are not defaults but that
+ # users commonly want.
+ #
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+ RSpec.configure do |config|
+   # rspec-expectations config goes here. You can use an alternate
+   # assertion/expectation library such as wrong or the stdlib/minitest
+   # assertions if you prefer.
+   config.expect_with :rspec do |expectations|
+     # This option will default to `true` in RSpec 4. It makes the `description`
+     # and `failure_message` of custom matchers include text for helper methods
+     # defined using `chain`, e.g.:
+     #     be_bigger_than(2).and_smaller_than(4).description
+     #     # => "be bigger than 2 and smaller than 4"
+     # ...rather than:
+     #     # => "be bigger than 2"
+     expectations.include_chain_clauses_in_custom_matcher_descriptions = true
+   end
+ 
+   # rspec-mocks config goes here. You can use an alternate test double
+   # library (such as bogus or mocha) by changing the `mock_with` option here.
+   config.mock_with :rspec do |mocks|
+     # Prevents you from mocking or stubbing a method that does not exist on
+     # a real object. This is generally recommended, and will default to
+     # `true` in RSpec 4.
+     mocks.verify_partial_doubles = true
+   end
+ 
+   # The settings below are suggested to provide a good initial experience
+   # with RSpec, but feel free to customize to your heart's content.
+ =begin
+   # These two settings work together to allow you to limit a spec run
+   # to individual examples or groups you care about by tagging them with
+   # `:focus` metadata. When nothing is tagged with `:focus`, all examples
+   # get run.
+   config.filter_run :focus
+   config.run_all_when_everything_filtered = true
+ 
+   # Limits the available syntax to the non-monkey patched syntax that is recommended.
+   # For more details, see:
+   #   - http://myronmars.to/n/dev-blog/2012/06/rspecs-new-expectation-syntax
+   #   - http://teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
+   #   - http://myronmars.to/n/dev-blog/2014/05/notable-changes-in-rspec-3#new__config_option_to_disable_rspeccore_monkey_patching
+   config.disable_monkey_patching!
+ 
+   # This setting enables warnings. It's recommended, but in some cases may
+   # be too noisy due to issues in dependencies.
+   config.warnings = true
+ 
+   # Many RSpec users commonly either run the entire suite or an individual
+   # file, and it's useful to allow more verbose output when running an
+   # individual spec file.
+   if config.files_to_run.one?
+     # Use the documentation formatter for detailed output,
+     # unless a formatter has already been configured
+     # (e.g. via a command-line flag).
+     config.default_formatter = 'doc'
+   end
+ 
+   # Print the 10 slowest examples and example groups at the
+   # end of the spec run, to help surface which specs are running
+   # particularly slow.
+   config.profile_examples = 10
+ 
+   # Run specs in random order to surface order dependencies. If you find an
+   # order dependency and want to debug it, you can fix the order by providing
+   # the seed, which is printed after each run.
+   #     --seed 1234
+   config.order = :random
+ 
+   # Seed global randomization in this process using the `--seed` CLI option.
+   # Setting this allows you to use `--seed` to deterministically reproduce
+   # test failures related to randomization by passing the same `--seed` value
+   # as the one that triggered the failure.
+   Kernel.srand config.seed
+ =end
+ end
metadata ADDED
@@ -0,0 +1,104 @@
+ --- !ruby/object:Gem::Specification
+ name: content_crawler
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Prakash Natarajan
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-12-23 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.7'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.7'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Crawls content from websites using user-supplied XPath expressions.
+   New functionality will be added in future releases.
+ email:
+ - prakashntrjn@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".rspec"
+ - Gemfile
+ - Gemfile.lock
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - content_crawler.gemspec
+ - lib/content_crawler.rb
+ - lib/content_crawler/crawler_process.rb
+ - lib/content_crawler/version.rb
+ - public/html_test.html
+ - spec/addition_spec.rb
+ - spec/content_crawler_spec.rb
+ - spec/spec_helper.rb
+ homepage: ''
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.2.2
+ signing_key:
+ specification_version: 4
+ summary: Content crawler
+ test_files:
+ - spec/addition_spec.rb
+ - spec/content_crawler_spec.rb
+ - spec/spec_helper.rb