alacrity 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (6) hide show
  1. checksums.yaml +15 -0
  2. data/LICENSE.txt +19 -0
  3. data/README.md +43 -0
  4. data/alacrity.gemspec +14 -0
  5. data/lib/alacrity.rb +34 -0
  6. metadata +62 -0
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ ODQ1NTI5Yjc3NzNjZWI2NDhiYTg1Yzk3MGZlNmEyZDRmMmE3NDI0NQ==
5
+ data.tar.gz: !binary |-
6
+ NTRhNjQ5OWEwZDI0NjFlNzdkN2EyOGEwNDcyMWExYjE0OTIxMzZjNw==
7
+ SHA512:
8
+ metadata.gz: !binary |-
9
+ NjhhNjQyYmM3NzhiZjQzMDk1ZjIwYjczMWYxM2JkZDQyYzE0ZTdjYzg1YTFl
10
+ ZGUwODMyMTY3MGJjMGJkZDYwMjc1NDQ0ZjFlZTc0OWQ1MzQ5MDk1NGY2OGRm
11
+ MWE1MWZlZTFlMzE3N2E4MTI4ZjMyMmQzMGViNGJkMjIxYzE2NjU=
12
+ data.tar.gz: !binary |-
13
+ N2Y0MWI2ZmU0NjVjZmUyYTMxYTg2MDI3NjI1M2I1MjEzMWI5ZTA5MGM2Yjgz
14
+ ZDRlYjU2N2Q0YzYxOWIwMGZlNzBjNGU5YWZmYzY1YjhhMGYzOGE3YTZlN2Nl
15
+ MTc3N2U5MDliNjNlMzJmYTJhMDdkYjM5OTIzZTc0NjRhMGFkMmM=
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2014 Shuddhashil Ray(rayshuddhashil@gmail.com)
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
@@ -0,0 +1,43 @@
1
+ # ALACRITY
2
+
3
+ Alacrity is a simple Ruby scraper: given the URL of a web page, Alacrity finds all the relevant information you want
4
+ for the elements you search for. Alacrity depends on the Nokogiri gem and uses Nokogiri's built-in CSS selectors.
5
+
6
+ ## Installation
7
+
8
+ Add gem 'alacrity' to Gemfile
9
+ Or
10
+ gem install alacrity
11
+
12
+ ## How to use Alacrity
13
+
14
+ Let's say you have a source page URL whose markup contains the following snippet:
15
+
16
+ <html>
17
+ <body>
18
+ <h3>I want to be scraped!</h3>
19
+ <h3>Dont forget to scrap me too!</h3>
20
+ </body>
21
+ </html>
22
+
23
+ Running Alacrity for searching elements 'h3' will return something like this:
24
+
25
+ {"all_h3_tags" => {0=>"I want to be scraped!", 1=>"Dont forget to scrap me too!"}}
26
+
27
+ ### Sample Run:
28
+
29
+ get_me_info = Alacrity::Source.new("http://some_url.com") do
30
+ fetch "all_h3_tags", :lookup=>'h3'
31
+ end
32
+
33
+ ## Custom procs and lambdas!
34
+
35
+ By default, Alacrity collects the text of every element it finds. You can instead supply your own proc or lambda via :post_fetch to shape the structured data however you like. Note that the 'elem' passed to your proc/lambda is a Nokogiri::XML::Element, so read the Nokogiri documentation to see the methods and attributes available on Nokogiri::XML::Element.
36
+
37
+ ### Sample Run:
38
+
39
+ get_me_info = Alacrity::Source.new("http://www.infibeam.com/") do
40
+ fetch "all_anchor_tags", :lookup=>'a',:post_fetch=>proc {|elem| elem.attributes["href"].value rescue nil}
41
+ end
42
+
43
+ get_me_info.structured_data["all_anchor_tags"] should give you the links of all the anchor tags!
@@ -0,0 +1,14 @@
1
# Gem specification for alacrity: declares the gem's identity, the files it
# ships, and its single runtime dependency (nokogiri).
Gem::Specification.new do |spec|
  spec.name          = 'alacrity'
  spec.version       = '1.0.0'
  spec.date          = '2014-03-13'
  spec.summary       = 'Web Page Scraper written in Ruby'
  spec.description   = 'Web Page Scraper written in Ruby - Extracts any viable HTML DOM elements ' \
                       'specified using CSS selectors into structured scraped data'
  # RubyGems documents `authors` as an Array of names, so pass one explicitly
  # instead of relying on a bare String being coerced.
  spec.authors       = ['Shuddhashil Ray']
  spec.email         = 'rayshuddhashil@gmail.com'
  spec.files         = %w[README.md LICENSE.txt alacrity.gemspec lib/alacrity.rb]
  spec.require_paths = ['lib']
  spec.homepage      = 'https://github.com/raycoding/alacrity'
  spec.license       = 'MIT'

  spec.add_dependency 'nokogiri', '~> 1.5.11'
end
@@ -0,0 +1,34 @@
1
+ require 'rubygems'
2
+ require 'nokogiri'
3
require 'open-uri'

# Alacrity is a small scraping DSL: a Source loads an HTML page, and each
# `fetch` call stores the elements matching a CSS selector in `structured_data`.
module Alacrity
  # Error type describing a lookup that matched no elements. The misspelled
  # name ("HMTL") is kept because existing callers may reference it.
  class HMTLDOMElementsNotFound < StandardError; end

  # Correctly spelled alias for HMTLDOMElementsNotFound.
  HTMLDOMElementsNotFound = HMTLDOMElementsNotFound

  # A parsed page plus the data scraped from it.
  #
  #   info = Alacrity::Source.new("http://some_url.com") do
  #     fetch "all_h3_tags", :lookup => 'h3'
  #   end
  #   info.structured_data["all_h3_tags"] # => {0 => "...", 1 => "..."}
  class Source
    # Locations with one of these schemes are fetched through open-uri;
    # anything else is treated as a local file path.
    REMOTE_SCHEME = %r{\A(?:https?|ftp)://}i

    attr_accessor :document, :structured_data, :errors

    # Loads and parses the page, then runs the optional DSL block.
    #
    # @param source_url [String, #to_path] http(s)/ftp URL or local file path
    # @yield a block taking no parameters is evaluated in the context of the
    #   new Source (so `fetch` can be called bare); a block taking a parameter
    #   is called with the Source instead
    def initialize(source_url, &block)
      @structured_data = {}
      @errors = []
      # Parse while the stream is open; the block form closes it afterwards.
      @document = open_source(source_url) { |io| Nokogiri::HTML(io) }
      return unless block

      if block.arity < 1
        instance_eval(&block)
      else
        block.call(self)
      end
    end

    # Looks up elements by CSS selector and stores them under +key_name+ as a
    # Hash of { index => value }.
    #
    # When nothing matches, an empty Hash is stored and a message is appended
    # to +errors+ instead of raising.
    #
    # @param key_name [Object] key under which results are stored
    # @param arguments [Hash] :lookup is the CSS selector; optional :post_fetch
    #   is a callable that receives each matched element and returns the value
    #   to store (defaults to the element's text)
    # @return [Hash] the results stored for +key_name+
    def fetch(key_name, arguments)
      page_elements = @document.css(arguments[:lookup])

      if page_elements.empty?
        @errors << "While parsing the page to find ('#{key_name}') we couldn't find any elements"
        return @structured_data[key_name] = {}
      end

      extractor = arguments[:post_fetch] || :text.to_proc
      # Register the hash before iterating so values collected so far stay
      # visible even if a custom :post_fetch raises part-way through.
      results = @structured_data[key_name] = {}
      page_elements.each_with_index do |elem, index|
        results[index] = extractor.call(elem)
      end
      results
    end

    private

    # Opens +source_url+ for reading, yields the IO to the block, and closes it.
    # Kernel#open is deliberately avoided: without open-uri it cannot read
    # URLs, and it would execute input such as "|cmd" as a shell command.
    def open_source(source_url, &block)
      location = source_url.to_s
      if location =~ REMOTE_SCHEME
        URI.parse(location).open(&block)
      else
        File.open(source_url, &block)
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,62 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: alacrity
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Shuddhashil Ray
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-03-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: 1.5.11
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: 1.5.11
27
+ description: Web Page Scraper written in Ruby - Extracts any viable HTML DOM elements
28
+ specified using CSS selectors into structured scraped data
29
+ email: rayshuddhashil@gmail.com
30
+ executables: []
31
+ extensions: []
32
+ extra_rdoc_files: []
33
+ files:
34
+ - LICENSE.txt
35
+ - README.md
36
+ - alacrity.gemspec
37
+ - lib/alacrity.rb
38
+ homepage: https://github.com/raycoding/alacrity
39
+ licenses:
40
+ - MIT
41
+ metadata: {}
42
+ post_install_message:
43
+ rdoc_options: []
44
+ require_paths:
45
+ - lib
46
+ required_ruby_version: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - ! '>='
49
+ - !ruby/object:Gem::Version
50
+ version: '0'
51
+ required_rubygems_version: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ! '>='
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ requirements: []
57
+ rubyforge_project:
58
+ rubygems_version: 2.2.2
59
+ signing_key:
60
+ specification_version: 4
61
+ summary: Web Page Scraper written in Ruby
62
+ test_files: []