slmndr 0.0.0 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +2 -2
- data/lib/slmndr.rb +15 -27
- data.tar.gz.sig +0 -0
- metadata +86 -4
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2ebd74e5e1f4bf29b4b8a01362c13eb8196e4ff5
|
4
|
+
data.tar.gz: 6053b2e0ec26ba5dd0df380060dd60bcf3a45185
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 77db1e9e5d4d8ba8e2ef70d1ccd87e4220f95a06e093405e43cd29d0f8f6ca8c9643d2872e10eb5ce38496796c650bcd0b0118c7a963dc363c05fa5b91ea5c18
|
7
|
+
data.tar.gz: 88ba03da2130c604382f9db2690da8542cf87d8e03c036823afc4dd89f0885a68f192827e5d6a6cc024736cc6a89289b605f54569b798aa11fbd21352ef27da9
|
checksums.yaml.gz.sig
CHANGED
@@ -1,2 +1,2 @@
|
|
1
|
-
�
|
2
|
-
|
1
|
+
�
|
2
|
+
�e]�
|
data/lib/slmndr.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# Salamander: A minimalistic ruby web crawling framework.
|
2
|
+
# Authored by: John Lawrence M. Penafiel
|
3
3
|
|
4
4
|
require 'time'
|
5
5
|
require 'thread'
|
@@ -12,21 +12,12 @@ require 'open_uri_redirections'
|
|
12
12
|
require 'nokogiri'
|
13
13
|
require 'addressable/uri'
|
14
14
|
|
15
|
-
|
16
|
-
## Salamander
|
17
|
-
## Description
|
18
|
-
## The Crawler module provides an easy way for the other components of the Salamander system to perform crawling.
|
19
|
-
## Functions
|
20
|
-
## Salamander::crawl
|
15
|
+
# The module containing the Salamander framework itself.
|
21
16
|
module Salamander
|
22
17
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
## Extracts outgoing links from the HTML pointed to by the given URL string.
|
27
|
-
## Parameters
|
28
|
-
## url - The URL of the HTML page the function is extracting links from.
|
29
|
-
## html - The HTML data to extract links from.
|
18
|
+
# Extracts outgoing links from the HTML pointed to by the given URL string.
|
19
|
+
# @param url The URL of the HTML page the function is extracting links from.
|
20
|
+
# @param html The HTML data to extract links from.
|
30
21
|
def self.get_links(url, html)
|
31
22
|
# Initialize
|
32
23
|
uri = Addressable::URI.parse(url)
|
@@ -63,18 +54,15 @@ module Salamander
|
|
63
54
|
end
|
64
55
|
end
|
65
56
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
## delay - Optional. Default: 1. A positive float indicating the number of seconds between requests in one thread.
|
76
|
-
## threads - Optional. Default: 1. A positive integer indicating the number of allowed simultaneous requests to the target web asset.
|
77
|
-
## agent - Optional. Default: "Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)". The user-agent string to be used.
|
57
|
+
# Performs a restricted, unauthenticated, breadth-first crawl of the target web asset.
|
58
|
+
# Function blocks until all threads terminate.
|
59
|
+
# Optional Arguments (Place these inside the 'args' hash)
|
60
|
+
# visit: A lambda which accepts a URL, and returns a boolean which tells the crawler if the URL should be visited.
|
61
|
+
# delay: A positive float indicating the number of seconds between requests in one thread. Defaults to 1.
|
62
|
+
# threads: A positive integer indicating the number of allowed simultaneous requests to the target web asset. Defaults to 1.
|
63
|
+
# agent: The user-agent string to be used. Defaults to "Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)".
|
64
|
+
# @param urls A list of strings containing the seed URLs.
|
65
|
+
# @param args A hash containing optional arguments for the function.
|
78
66
|
def crawl(urls, args = {})
|
79
67
|
# Get arguments
|
80
68
|
visit = nil
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: slmndr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.0
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Lawrence M. Penafiel
|
@@ -30,9 +30,91 @@ cert_chain:
|
|
30
30
|
kX+zehuhNK2jecNBpCmYOdpV/Tf9rA2qQ+TFBx08FfsibhdjbvXI1oN2uv+KBeAi
|
31
31
|
1ixqHDxvPm+/VQAK6wyHVbo6smzss/cry1yw2JTa6dk=
|
32
32
|
-----END CERTIFICATE-----
|
33
|
-
date: 2015-06-
|
34
|
-
dependencies:
|
35
|
-
|
33
|
+
date: 2015-06-30 00:00:00.000000000 Z
|
34
|
+
dependencies:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: json
|
37
|
+
requirement: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - "~>"
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '1.8'
|
42
|
+
- - ">="
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
version: 1.8.3
|
45
|
+
type: :runtime
|
46
|
+
prerelease: false
|
47
|
+
version_requirements: !ruby/object:Gem::Requirement
|
48
|
+
requirements:
|
49
|
+
- - "~>"
|
50
|
+
- !ruby/object:Gem::Version
|
51
|
+
version: '1.8'
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 1.8.3
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: open_uri_redirections
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0.2'
|
62
|
+
- - ">="
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
version: 0.2.1
|
65
|
+
type: :runtime
|
66
|
+
prerelease: false
|
67
|
+
version_requirements: !ruby/object:Gem::Requirement
|
68
|
+
requirements:
|
69
|
+
- - "~>"
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0.2'
|
72
|
+
- - ">="
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: 0.2.1
|
75
|
+
- !ruby/object:Gem::Dependency
|
76
|
+
name: nokogiri
|
77
|
+
requirement: !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - "~>"
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: '1.6'
|
82
|
+
- - ">="
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
version: 1.6.6.2
|
85
|
+
type: :runtime
|
86
|
+
prerelease: false
|
87
|
+
version_requirements: !ruby/object:Gem::Requirement
|
88
|
+
requirements:
|
89
|
+
- - "~>"
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
version: '1.6'
|
92
|
+
- - ">="
|
93
|
+
- !ruby/object:Gem::Version
|
94
|
+
version: 1.6.6.2
|
95
|
+
- !ruby/object:Gem::Dependency
|
96
|
+
name: addressable
|
97
|
+
requirement: !ruby/object:Gem::Requirement
|
98
|
+
requirements:
|
99
|
+
- - "~>"
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '2.3'
|
102
|
+
- - ">="
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
version: 2.3.8
|
105
|
+
type: :runtime
|
106
|
+
prerelease: false
|
107
|
+
version_requirements: !ruby/object:Gem::Requirement
|
108
|
+
requirements:
|
109
|
+
- - "~>"
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
version: '2.3'
|
112
|
+
- - ">="
|
113
|
+
- !ruby/object:Gem::Version
|
114
|
+
version: 2.3.8
|
115
|
+
description: |-
|
116
|
+
A minimalistic ruby web crawling framework.
|
117
|
+
See https://github.com/penafieljlm/slmndr for more information.
|
36
118
|
email: penafieljlm@gmail.com
|
37
119
|
executables: []
|
38
120
|
extensions: []
|
metadata.gz.sig
CHANGED
Binary file
|