alacrity 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/LICENSE.txt +19 -0
- data/README.md +43 -0
- data/alacrity.gemspec +14 -0
- data/lib/alacrity.rb +34 -0
- metadata +62 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
ODQ1NTI5Yjc3NzNjZWI2NDhiYTg1Yzk3MGZlNmEyZDRmMmE3NDI0NQ==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
NTRhNjQ5OWEwZDI0NjFlNzdkN2EyOGEwNDcyMWExYjE0OTIxMzZjNw==
|
7
|
+
SHA512:
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
NjhhNjQyYmM3NzhiZjQzMDk1ZjIwYjczMWYxM2JkZDQyYzE0ZTdjYzg1YTFl
|
10
|
+
ZGUwODMyMTY3MGJjMGJkZDYwMjc1NDQ0ZjFlZTc0OWQ1MzQ5MDk1NGY2OGRm
|
11
|
+
MWE1MWZlZTFlMzE3N2E4MTI4ZjMyMmQzMGViNGJkMjIxYzE2NjU=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
N2Y0MWI2ZmU0NjVjZmUyYTMxYTg2MDI3NjI1M2I1MjEzMWI5ZTA5MGM2Yjgz
|
14
|
+
ZDRlYjU2N2Q0YzYxOWIwMGZlNzBjNGU5YWZmYzY1YjhhMGYzOGE3YTZlN2Nl
|
15
|
+
MTc3N2U5MDliNjNlMzJmYTJhMDdkYjM5OTIzZTc0NjRhMGFkMmM=
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2014 Shuddhashil Ray(rayshuddhashil@gmail.com)
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
5
|
+
in the Software without restriction, including without limitation the rights
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# ALACRITY
|
2
|
+
|
3
|
+
Alacrity is a simple Ruby scraper: given a web page source URL, Alacrity finds all the relevant information you want
|
4
|
+
for the search elements. Alacrity depends on the Nokogiri gem and uses Nokogiri's built-in CSS selectors.
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
Add gem 'alacrity' to Gemfile
|
9
|
+
Or
|
10
|
+
gem install alacrity
|
11
|
+
|
12
|
+
## How to use Alacrity
|
13
|
+
|
14
|
+
Let's say you have a source page URL whose markup contains the following snippet:
|
15
|
+
|
16
|
+
<html>
|
17
|
+
<body>
|
18
|
+
<h3>I want to be scraped!</h3>
|
19
|
+
<h3>Dont forget to scrap me too!</h3>
|
20
|
+
</body>
|
21
|
+
</html>
|
22
|
+
|
23
|
+
Running Alacrity for searching elements 'h3' will return something like this:
|
24
|
+
|
25
|
+
{"all_h3_tags" => {0=>"I want to be scraped!", 1=>"Dont forget to scrap me too!"}}
|
26
|
+
|
27
|
+
### Sample Run:
|
28
|
+
|
29
|
+
get_me_info = Alacrity::Source.new("http://some_url.com") do
|
30
|
+
fetch "all_h3_tags", :lookup=>'h3'
|
31
|
+
end
|
32
|
+
|
33
|
+
## Custom procs and lambdas!
|
34
|
+
|
35
|
+
By default Alacrity collects the text of every element it finds. You can instead pass your own proc or lambda (via :post_fetch) to shape the structured data however you want. Note that the 'elem' passed to your proc/lambda is a Nokogiri::XML::Element, so read the Nokogiri documentation to see the methods and attributes available on Nokogiri::XML::Element.
|
36
|
+
|
37
|
+
### Sample Run:
|
38
|
+
|
39
|
+
get_me_info = Alacrity::Source.new("http://www.infibeam.com/") do
|
40
|
+
fetch "all_anchor_tags", :lookup=>'a',:post_fetch=>proc {|elem| elem.attributes["href"].value rescue nil}
|
41
|
+
end
|
42
|
+
|
43
|
+
get_me_info.structured_data["all_anchor_tags"] should give you the links of all anchor tags!
|
data/alacrity.gemspec
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# Gem specification for alacrity: package identity, the files that ship in
# the gem, and its single runtime dependency (nokogiri).
Gem::Specification.new do |spec|
  # Identity
  spec.name     = 'alacrity'
  spec.version  = '1.0.0'
  spec.date     = '2014-03-13'
  spec.license  = 'MIT'
  spec.homepage = 'https://github.com/raycoding/alacrity'

  # Human-readable descriptions
  spec.summary     = 'Web Page Scraper written in Ruby'
  spec.description = 'Web Page Scraper written in Ruby - Extracts any viable HTML DOM elements specified using CSS selectors into structured scraped data'

  # Author contact
  spec.authors = ['Shuddhashil Ray']
  spec.email   = 'rayshuddhashil@gmail.com'

  # Packaged files and load path
  spec.files         = %w[README.md LICENSE.txt alacrity.gemspec lib/alacrity.rb]
  spec.require_paths = %w[lib]

  # Runtime dependencies
  spec.add_dependency 'nokogiri', '~> 1.5.11'
end
|
data/lib/alacrity.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'rubygems'
require 'nokogiri'
require 'open-uri'
|
3
|
+
module Alacrity
  # Raised (and rescued inside Source#fetch) when a CSS lookup matches no
  # elements. The misspelling ("HMTL") is kept on purpose: the constant is
  # part of the public API.
  class HMTLDOMElementsNotFound < StandardError; end

  # Parses an HTML source and collects the elements matched by CSS selectors
  # into #structured_data; lookups that match nothing are noted in #errors.
  class Source
    # Sources beginning with one of these schemes are fetched via open-uri;
    # anything else is treated as a local file path.
    REMOTE_SOURCE = %r{\A(?:https?|ftp)://}i

    attr_accessor :document, :structured_data, :errors

    # source_url - an http(s)/ftp URL or a local file path.
    # block      - optional. A block taking no parameters is instance_eval'd,
    #              so `fetch` can be called bare inside it; a block taking a
    #              parameter is called with this Source instead.
    def initialize(source_url, &block)
      @structured_data = {}
      @errors = []
      @document = load_document(source_url)
      return unless block

      block.arity < 1 ? instance_eval(&block) : block.call(self)
    end

    # Looks up arguments[:lookup] (a CSS selector) in the document and stores
    # the matches under structured_data[key_name] as { index => value }.
    # The value is elem.text, or the result of arguments[:post_fetch].call(elem)
    # when a :post_fetch callable is given.
    #
    # When nothing matches, structured_data[key_name] is set to {} and a
    # message is appended to #errors; no exception reaches the caller.
    def fetch(key_name, arguments)
      page_elements = @document.css(arguments[:lookup])
      if page_elements.empty?
        raise HMTLDOMElementsNotFound,
              "While parsing the page to find ('#{key_name}') we couldn't find any elements"
      end

      extract = arguments[:post_fetch] || lambda { |elem| elem.text }
      @structured_data[key_name] = {}
      page_elements.each_with_index do |elem, index|
        @structured_data[key_name][index] = extract.call(elem)
      end
    rescue HMTLDOMElementsNotFound => e
      @structured_data[key_name] = {}
      @errors << e.to_s
    end

    private

    # Opens the source, parses it with Nokogiri and closes the handle again.
    # Dispatching explicitly (instead of a bare Kernel#open) means URLs work
    # because open-uri is actually used, and a "| command" string is never
    # executed as a subprocess - it is simply a file name that does not exist.
    def load_document(source_url)
      location = source_url.to_s
      if location =~ REMOTE_SOURCE
        # URI::HTTP/HTTPS/FTP gain #open from open-uri (OpenURI::OpenRead).
        URI.parse(location).open { |io| Nokogiri::HTML(io) }
      else
        File.open(location) { |io| Nokogiri::HTML(io) }
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: alacrity
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Shuddhashil Ray
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-03-13 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 1.5.11
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 1.5.11
|
27
|
+
description: Web Page Scraper written in Ruby - Extracts any viable HTML DOM elements
|
28
|
+
specified using CSS selectors into structured scraped data
|
29
|
+
email: rayshuddhashil@gmail.com
|
30
|
+
executables: []
|
31
|
+
extensions: []
|
32
|
+
extra_rdoc_files: []
|
33
|
+
files:
|
34
|
+
- LICENSE.txt
|
35
|
+
- README.md
|
36
|
+
- alacrity.gemspec
|
37
|
+
- lib/alacrity.rb
|
38
|
+
homepage: https://github.com/raycoding/alacrity
|
39
|
+
licenses:
|
40
|
+
- MIT
|
41
|
+
metadata: {}
|
42
|
+
post_install_message:
|
43
|
+
rdoc_options: []
|
44
|
+
require_paths:
|
45
|
+
- lib
|
46
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
47
|
+
requirements:
|
48
|
+
- - ! '>='
|
49
|
+
- !ruby/object:Gem::Version
|
50
|
+
version: '0'
|
51
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - ! '>='
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '0'
|
56
|
+
requirements: []
|
57
|
+
rubyforge_project:
|
58
|
+
rubygems_version: 2.2.2
|
59
|
+
signing_key:
|
60
|
+
specification_version: 4
|
61
|
+
summary: Web Page Scraper written in Ruby
|
62
|
+
test_files: []
|