apollo-crawler 0.1.2 → 0.1.3

@@ -1,20 +1,20 @@
- # Main
- require 'apollo_crawler/cache'
- require 'apollo_crawler/crawler'
- require 'apollo_crawler/formatter'
-
- # Caches
- require 'apollo_crawler/caches/factory'
- require 'apollo_crawler/caches/memory_cache'
- require 'apollo_crawler/caches/null_cache'
-
- # Crawlers
- require 'apollo_crawler/crawlers/alexa_com/alexa'
- require 'apollo_crawler/crawlers/firmy_cz/firmy'
- require 'apollo_crawler/crawlers/slashdot_org/slashdot'
- require 'apollo_crawler/crawlers/ycombinator_com/hacker_news'
-
- # Formatters
- require 'apollo_crawler/formatters/formatter_json'
- require 'apollo_crawler/formatters/formatter_plain'
- require 'apollo_crawler/formatters/formatter_table'
+ # Main
+ require 'apollo_crawler/cache'
+ require 'apollo_crawler/crawler'
+ require 'apollo_crawler/formatter'
+
+ # Caches
+ require 'apollo_crawler/caches/factory'
+ require 'apollo_crawler/caches/memory_cache'
+ require 'apollo_crawler/caches/null_cache'
+
+ # Crawlers
+ require 'apollo_crawler/crawlers/alexa_com/alexa'
+ require 'apollo_crawler/crawlers/firmy_cz/firmy'
+ require 'apollo_crawler/crawlers/slashdot_org/slashdot'
+ require 'apollo_crawler/crawlers/ycombinator_com/hacker_news'
+
+ # Formatters
+ require 'apollo_crawler/formatters/formatter_json'
+ require 'apollo_crawler/formatters/formatter_plain'
+ require 'apollo_crawler/formatters/formatter_table'
@@ -1,34 +1,34 @@
- module Apollo
-   module Caches
-     class Cache
-       # Get value associated with key from cache
-       def get(key, *args)
-
-         # Not found, Create, cache and return
-         res = yield args
-         return res
-       end
-
-       # Set value associated with key
-       # Return cached value
-       def set(key, value)
-         return value
-       end
-
-       # Check if cache contains specified key
-       def contains(key)
-         return false
-       end
-
-       # Invalidate key/value pair
-       def invalidate(key)
-         return true
-       end
-
-       # Clear cache
-       def clear
-         return
-       end
-     end # Cache
-   end # Caches
- end # Apollo
+ module Apollo
+   module Caches
+     class Cache
+       # Get value associated with key from cache
+       def get(key, *args)
+
+         # Not found, Create, cache and return
+         res = yield args
+         return res
+       end
+
+       # Set value associated with key
+       # Return cached value
+       def set(key, value)
+         return value
+       end
+
+       # Check if cache contains specified key
+       def contains(key)
+         return false
+       end
+
+       # Invalidate key/value pair
+       def invalidate(key)
+         return true
+       end
+
+       # Clear cache
+       def clear
+         return
+       end
+     end # Cache
+   end # Caches
+ end # Apollo
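The base Cache above defines a read-through contract: `get` takes a key plus a block, yields the block on a miss, and returns the block's result, while `set` returns the cached value. A minimal usage sketch, assuming the gem's top-level file is loaded with `require 'apollo_crawler'` (the factory and concrete caches appear later in this diff):

require 'apollo_crawler'

# Build whatever cache the factory hands out (a Memory cache in this release)
cache = Apollo::Caches::Factory.instance.construct

# Read-through get: the block runs for uncached keys and its result is
# returned to the caller, mirroring how Crawler#fetch_document uses it.
body = cache.get('http://example.com/') do
  "<html>fetched on a cache miss</html>"
end

cache.set('http://example.com/', body)   # set returns the cached value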
@@ -1,18 +1,18 @@
- require File.join(File.dirname(__FILE__), '..', 'cache')
- require 'singleton'
-
- module Apollo
-   module Caches
-     class Factory
-       include Singleton
-
-       def self.construct()
-         self.singleton.construct()
-       end
-
-       def construct()
-         Memory.new()
-       end
-     end # Factory
-   end # Caches
- end # Apollo
+ require File.join(File.dirname(__FILE__), '..', 'cache')
+ require 'singleton'
+
+ module Apollo
+   module Caches
+     class Factory
+       include Singleton
+
+       def self.construct()
+         self.singleton.construct()
+       end
+
+       def construct()
+         Memory.new()
+       end
+     end # Factory
+   end # Caches
+ end # Apollo
@@ -1,30 +1,30 @@
- require File.join(File.dirname(__FILE__), '..', 'cache')
-
- module Apollo
-   module Caches
-     class Filesystem < Cache
-       # Get value associated with key from cache
-       def get(key, *args)
-         # Not found, Create, cache and return
-         res = yield args
-         return res
-       end
-
-       # Set value associated with key
-       # Return cached value
-       def set(key, value)
-         return value
-       end
-
-       # Check if cache contains specified key
-       def contains(key)
-         return false
-       end
-
-       # Invalidate key/value pair
-       def invalidate(key)
-         return true
-       end
-     end # Filesystem
-   end # Caches
- end # Apollo
+ require File.join(File.dirname(__FILE__), '..', 'cache')
+
+ module Apollo
+   module Caches
+     class Filesystem < Cache
+       # Get value associated with key from cache
+       def get(key, *args)
+         # Not found, Create, cache and return
+         res = yield args
+         return res
+       end
+
+       # Set value associated with key
+       # Return cached value
+       def set(key, value)
+         return value
+       end
+
+       # Check if cache contains specified key
+       def contains(key)
+         return false
+       end
+
+       # Invalidate key/value pair
+       def invalidate(key)
+         return true
+       end
+     end # Filesystem
+   end # Caches
+ end # Apollo
@@ -1,43 +1,43 @@
- require File.join(File.dirname(__FILE__), '..', 'cache')
-
- module Apollo
-   module Caches
-     class Memory < Cache
-       @storage = nil
-
-       def initialize
-         @storage = {}
-       end
-
-       # Get value associated with key from cache
-       def get(key, *args)
-         @storage[key]
-
-         # Not found, Create, cache and return
-         res = yield args
-         return res
-       end
-
-       # Set value associated with key
-       # Return cached value
-       def set(key, value)
-         @storage[key] = value
-       end
-
-       # Check if cache contains specified key
-       def contains(key)
-         @storage.has_key?(key)
-       end
-
-       # Invalidate key/value pair
-       def invalidate(key)
-         @storage.delete(key)
-       end
-
-       # Clear cache
-       def clear
-         @storage.clear
-       end
-     end # Null
-   end # Caches
- end # Apollo
+ require File.join(File.dirname(__FILE__), '..', 'cache')
+
+ module Apollo
+   module Caches
+     class Memory < Cache
+       @storage = nil
+
+       def initialize
+         @storage = {}
+       end
+
+       # Get value associated with key from cache
+       def get(key, *args)
+         @storage[key]
+
+         # Not found, Create, cache and return
+         res = yield args
+         return res
+       end
+
+       # Set value associated with key
+       # Return cached value
+       def set(key, value)
+         @storage[key] = value
+       end
+
+       # Check if cache contains specified key
+       def contains(key)
+         @storage.has_key?(key)
+       end
+
+       # Invalidate key/value pair
+       def invalidate(key)
+         @storage.delete(key)
+       end
+
+       # Clear cache
+       def clear
+         @storage.clear
+       end
+     end # Null
+   end # Caches
+ end # Apollo
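Memory is the cache the factory constructs in this version. It keeps values in a plain Hash, but note that `get` as written evaluates `@storage[key]` without returning it and then yields its block unconditionally, so callers receive the block's result even when the key is already cached. A short sketch of the class as it behaves here (assuming the gem is loaded):

cache = Apollo::Caches::Memory.new

cache.set('answer', 42)               # stores 42 and returns it
cache.contains('answer')              # => true
cache.get('answer') { 'recomputed' }  # => 'recomputed' (the block result, see note above)
cache.invalidate('answer')            # deletes the key/value pair
cache.clear                           # empties the whole store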
@@ -1,30 +1,30 @@
- require File.join(File.dirname(__FILE__), '..', 'cache')
-
- module Apollo
-   module Caches
-     class Null < Cache
-       # Get value associated with key from cache
-       def get(key, *args)
-         # Not found, Create, cache and return
-         res = yield args
-         return res
-       end
-
-       # Set value associated with key
-       # Return cached value
-       def set(key, value)
-         return value
-       end
-
-       # Check if cache contains specified key
-       def contains(key)
-         return false
-       end
-
-       # Invalidate key/value pair
-       def invalidate(key)
-         return true
-       end
-     end # Null
-   end # Caches
- end # Apollo
+ require File.join(File.dirname(__FILE__), '..', 'cache')
+
+ module Apollo
+   module Caches
+     class Null < Cache
+       # Get value associated with key from cache
+       def get(key, *args)
+         # Not found, Create, cache and return
+         res = yield args
+         return res
+       end
+
+       # Set value associated with key
+       # Return cached value
+       def set(key, value)
+         return value
+       end
+
+       # Check if cache contains specified key
+       def contains(key)
+         return false
+       end
+
+       # Invalidate key/value pair
+       def invalidate(key)
+         return true
+       end
+     end # Null
+   end # Caches
+ end # Apollo
@@ -1,129 +1,129 @@
- require "open-uri"
- require "nokogiri"
-
- module Apollo
-   module Crawlers
-     class Crawler
-       @backlog = nil
-
-       def initialize
-         @backlog = []
-       end
-
-       # Name of the crawler
-       def name
-         return "Crawler Base"
-       end
-
-       def url
-         return nil
-       end
-
-       # - (0) Figure out URL
-       # - (1) Extract Data
-       # - (2) Extract Links
-       # - (3) Go to (0) eventually
-       def etl(url=nil)
-         # Look for passed URL use default instead and fail if it is not valid
-         if(url.nil? || url.empty?)
-           url = self.url
-         end
-
-         if(url.nil?)
-           return nil
-         end
-
-         if(url.kind_of?(Array))
-           @backlog.concat(url)
-         else
-           @backlog << url
-         end
-
-         res = []
-         # TODO: Respect limit of documents/urls processed
-         while(@backlog.empty? == false)
-           url = @backlog.shift
-
-           # puts "Processing '#{url}'"
-           doc = self.process_url(url)
-           res << doc
-
-           # TODO: Use log4r and log it only on info level
-           puts doc.inspect
-
-           if(!doc.nil? && !doc.empty?)
-             doc[:links].each do |link|
-               url = link[:link].to_s
-               # TODO: Use log4r and log it only on info level
-               #puts url
-
-               # TODO: Check if it is unique
-               @backlog << url
-             end
-           end
-         end
-         return res
-       end
-
-       def process_url(url)
-         # Try fetch document
-         doc = self.fetch_document(url)
-         if(doc.nil?)
-           return nil
-         end
-
-         # Try extract data from document
-         data = self.extract_data(doc)
-
-         # Try extract links for another documents
-         links = self.extract_links(doc)
-
-         # Format ETL result
-         res = {
-           :crawler => self.class.name,
-           :title => doc.title,
-           :data => data,
-           :links => links
-         }
-
-         # TODO: Add some async/callback signal for document processed
-
-         return res
-       end
-
-       # Fetch document
-       def fetch_document(url)
-         # TODO: Refactor following idiom
-         if(url == nil)
-           url = self.url
-         end
-
-         if(url.nil?)
-           return nil
-         end
-
-         # TODO: Use some (custom-made) low-level HTTTP Protocol cache - just for sure
-         cache = Apollo::Caches::Factory.instance.construct
-         raw = cache.get(url) do
-           open(url).read
-         end
-
-         # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
-         doc = Nokogiri::HTML(raw)
-         return doc
-       end
-
-       # Extracts data from document
-       def extract_data(doc)
-         res = []
-         return res
-       end
-
-       # Extract links to another documents from this document
-       def extract_links(doc)
-         res = []
-         return res
-       end
-     end
-   end
+ require "open-uri"
+ require "nokogiri"
+
+ module Apollo
+   module Crawlers
+     class Crawler
+       @backlog = nil
+
+       def initialize
+         @backlog = []
+       end
+
+       # Name of the crawler
+       def name
+         return "Crawler Base"
+       end
+
+       def url
+         return nil
+       end
+
+       # - (0) Figure out URL
+       # - (1) Extract Data
+       # - (2) Extract Links
+       # - (3) Go to (0) eventually
+       def etl(url=nil)
+         # Look for passed URL use default instead and fail if it is not valid
+         if(url.nil? || url.empty?)
+           url = self.url
+         end
+
+         if(url.nil?)
+           return nil
+         end
+
+         if(url.kind_of?(Array))
+           @backlog.concat(url)
+         else
+           @backlog << url
+         end
+
+         res = []
+         # TODO: Respect limit of documents/urls processed
+         while(@backlog.empty? == false)
+           url = @backlog.shift
+
+           # puts "Processing '#{url}'"
+           doc = self.process_url(url)
+           res << doc
+
+           # TODO: Use log4r and log it only on info level
+           puts doc.inspect
+
+           if(!doc.nil? && !doc.empty?)
+             doc[:links].each do |link|
+               url = link[:link].to_s
+               # TODO: Use log4r and log it only on info level
+               #puts url
+
+               # TODO: Check if it is unique
+               @backlog << url
+             end
+           end
+         end
+         return res
+       end
+
+       def process_url(url)
+         # Try fetch document
+         doc = self.fetch_document(url)
+         if(doc.nil?)
+           return nil
+         end
+
+         # Try extract data from document
+         data = self.extract_data(doc)
+
+         # Try extract links for another documents
+         links = self.extract_links(doc)
+
+         # Format ETL result
+         res = {
+           :crawler => self.class.name,
+           :title => doc.title,
+           :data => data,
+           :links => links
+         }
+
+         # TODO: Add some async/callback signal for document processed
+
+         return res
+       end
+
+       # Fetch document
+       def fetch_document(url)
+         # TODO: Refactor following idiom
+         if(url == nil)
+           url = self.url
+         end
+
+         if(url.nil?)
+           return nil
+         end
+
+         # TODO: Use some (custom-made) low-level HTTTP Protocol cache - just for sure
+         cache = Apollo::Caches::Factory.instance.construct
+         raw = cache.get(url) do
+           open(url).read
+         end
+
+         # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
+         doc = Nokogiri::HTML(raw)
+         return doc
+       end
+
+       # Extracts data from document
+       def extract_data(doc)
+         res = []
+         return res
+       end
+
+       # Extract links to another documents from this document
+       def extract_links(doc)
+         res = []
+         return res
+       end
+     end
+   end
  end
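The concrete crawlers required at the top of this diff (alexa, firmy, slashdot, hacker_news) are not among the files shown here, but the base class above defines the whole ETL loop: subclasses override `url`, `extract_data`, and `extract_links`, and `etl` fetches each document through the cache factory, parses it with Nokogiri, and follows the extracted links. A hedged sketch of such a subclass; the class name and CSS selector are illustrative, not taken from the gem:

require 'apollo_crawler'

module Apollo
  module Crawlers
    # Illustrative subclass; the gem's own crawlers live under
    # apollo_crawler/crawlers/* and may differ from this sketch.
    class ExampleOrg < Crawler
      def name
        "Example.org"
      end

      def url
        "http://example.org/"
      end

      # Pull headline text out of the fetched Nokogiri document
      def extract_data(doc)
        doc.css('h1').map { |node| { :text => node.text } }
      end

      # Returning no links keeps the backlog from growing
      def extract_links(doc)
        []
      end
    end # ExampleOrg
  end # Crawlers
end # Apollo

# Crawls the default URL and returns an array of
# { :crawler, :title, :data, :links } hashes.
results = Apollo::Crawlers::ExampleOrg.new.etl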