apollo-crawler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/crawler.rb ADDED
File without changes
data/lib/plugin.rb ADDED
@@ -0,0 +1,37 @@
1
+ require "open-uri"
2
+ require "nokogiri"
3
+
4
module Apollo
  module Crawler
    module Plugins
      # Abstract base class for all crawler plugins.
      #
      # Concrete plugins override #name and #run; #extract_doc_data and
      # #fetch_leafs are hooks that do nothing here.
      class Plugin
        # Human-readable plugin name, used in docs, lookups, listings, etc.
        def name
          "Plugin Base"
        end

        # Crawl entry point. A concrete plugin is expected to:
        # - Fetch its default URL (and transform it to a document)
        # - Extract and Load (Store) important data
        # - Look for other documents to crawl, e.g.:
        #   - "next page"
        #   - "people you may know on Linked in"
        #   - "will attend on FB")
        #
        # The base implementation just reports which plugin ran.
        def run
          { :plugin => self.class.name }
        end

        # Extracts data from the currently processed URL (called document here).
        # No-op in the base class.
        def extract_doc_data
        end

        # Tries to get links to other URLs (called leafs here) to crawl.
        # No-op in the base class.
        def fetch_leafs
        end
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
2
+
3
module Apollo
  module Crawler
    module Plugins
      # Crawls the Alexa landing page and extracts the alphabetical
      # category links.
      # PARAMETERIZE: Plugin class name
      class Alexa < Plugin
        # Entry-point URL. Frozen constant instead of a @@class variable:
        # class variables are shared across the whole inheritance tree.
        URL = "http://www.alexa.com/".freeze

        # XPath selecting the alphabetical category links.
        MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a".freeze

        # Human-readable plugin name used in listings.
        def name
          "Alexa Rank"
        end

        # Fetches the page and returns a hash with the plugin class name,
        # the page title, and the extracted {:text, :link} pairs.
        def run
          # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
          # URI.open: Kernel#open on a URL string (via open-uri) was
          # deprecated in Ruby 2.7 and removed in Ruby 3.0.
          doc = Nokogiri::HTML(URI.open(URL))

          res = doc.xpath(MATCHER_ITEM).map do |node|
            {
              :text => node.text,
              :link => URI.join(URL, node['href'])
            }
          end

          {
            :plugin => self.class.name,
            :title => doc.title,
            :res => res
          }
        end
      end
    end # Plugins
  end # Crawler
end # Apollo
@@ -0,0 +1,35 @@
1
+ require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
2
+
3
module Apollo
  module Crawler
    module Plugins
      # Crawls the firmy.cz landing page and extracts the alphabetical
      # category links.
      # PARAMETERIZE: Plugin class name
      class Firmy < Plugin
        # Entry-point URL. Frozen constant instead of a @@class variable:
        # class variables are shared across the whole inheritance tree.
        URL = "http://www.firmy.cz/".freeze

        # XPath selecting the alphabetical category links.
        MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a".freeze

        # Human-readable plugin name used in listings.
        def name
          "Firmy.cz"
        end

        # Fetches the page and returns a hash with the plugin class name,
        # the page title, and the extracted {:text, :link} pairs.
        def run
          # URI.open: Kernel#open on a URL string (via open-uri) was
          # deprecated in Ruby 2.7 and removed in Ruby 3.0.
          doc = Nokogiri::HTML(URI.open(URL))

          res = doc.xpath(MATCHER_ITEM).map do |node|
            {
              :text => node.text,
              :link => URI.join(URL, node['href'])
            }
          end

          {
            :plugin => self.class.name,
            :title => doc.title,
            :res => res
          }
        end
      end
    end # Plugins
  end # Crawler
end # Apollo
@@ -0,0 +1,35 @@
1
+ require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
2
+
3
module Apollo
  module Crawler
    module Plugins
      # Crawls the Slashdot front page and extracts story headline links.
      # PARAMETERIZE: Plugin class name
      class Slashdot < Plugin
        # Entry-point URL. Frozen constant instead of a @@class variable:
        # class variables are shared across the whole inheritance tree.
        URL = "http://slashdot.org/".freeze

        # XPath selecting the story headline anchors.
        MATCHER_ITEM = "//article/header/h2/span/a".freeze

        # Human-readable plugin name used in listings.
        def name
          "Slashdot"
        end

        # Fetches the page and returns a hash with the plugin class name,
        # the page title, and the extracted {:text, :link} pairs.
        def run
          # URI.open: Kernel#open on a URL string (via open-uri) was
          # deprecated in Ruby 2.7 and removed in Ruby 3.0.
          doc = Nokogiri::HTML(URI.open(URL))

          res = doc.xpath(MATCHER_ITEM).map do |node|
            {
              :text => node.text,
              :link => URI.join(URL, node['href'])
            }
          end

          {
            :plugin => self.class.name,
            :title => doc.title,
            :res => res
          }
        end
      end
    end # Plugins
  end # Crawler
end # Apollo
@@ -0,0 +1,35 @@
1
+ require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
2
+
3
module Apollo
  module Crawler
    module Plugins
      # Crawls the Hacker News front page and extracts story title links.
      # PARAMETERIZE: Plugin class name
      class HackerNews < Plugin
        # Entry-point URL. Frozen constant instead of a @@class variable:
        # class variables are shared across the whole inheritance tree.
        URL = "http://news.ycombinator.com/".freeze

        # XPath selecting the story title anchors.
        MATCHER_ITEM = "//td[@class = 'title']/a".freeze

        # Human-readable plugin name used in listings.
        def name
          "Hacker News"
        end

        # Fetches the page and returns a hash with the plugin class name,
        # the page title, and the extracted {:text, :link} pairs.
        def run
          # URI.open: Kernel#open on a URL string (via open-uri) was
          # deprecated in Ruby 2.7 and removed in Ruby 3.0.
          doc = Nokogiri::HTML(URI.open(URL))

          res = doc.xpath(MATCHER_ITEM).map do |node|
            {
              :text => node.text,
              :link => URI.join(URL, node['href'])
            }
          end

          {
            :plugin => self.class.name,
            :title => doc.title,
            :res => res
          }
        end
      end
    end # Plugins
  end # Crawler
end # Apollo
data/main.rb ADDED
@@ -0,0 +1,170 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ require "rubygems"
4
+ require "bundler/setup"
5
+
6
+ require 'json'
7
+
8
+ require "thor"
9
+
10
+ require "open-uri"
11
+ require "nokogiri"
12
+
13
+ require "pp"
14
+ require "optparse"
15
+
16
module Crawler
  # Command-line driver: discovers crawler plugins under lib/plugins,
  # registers them by (lower-cased) class name, and runs the ones named
  # on the command line (or all of them with --all).
  class Program
    # Initializer - Constructor
    def initialize
      @plugins = {}   # plugin key (String, lower-cased class name) => plugin class
      @options = {}   # parsed command-line options
      @optparser = nil
    end

    # Initialize command-line options and their parser.
    def init_options
      @options = {}
      @options[:verbose] = false

      @optparser = OptionParser.new do |opts|
        # This displays the help screen; all programs are assumed to
        # have this option.
        opts.on('-h', '--help', 'Display this screen') do
          puts opts
          exit
        end

        opts.on('-a', '--all', 'Run all plugins') do
          @options[:run_all] = true
        end

        opts.on('-v', '--verbose', 'Enable verbose output') do
          @options[:verbose] = true
        end

        opts.on('-l', '--list-plugins', 'List of plugins') do
          @options[:list_plugins] = true
        end
      end
    end

    # Parse the options passed on the command-line.
    # 'parse!' (unlike 'parse') removes recognized options and their
    # parameters from ARGV, leaving only the plugin names to run.
    def parse_options
      @optparser.parse!
    end

    # Load the global config file, if one exists, by requiring it.
    def load_config_file
      config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
      # File.exist? - File.exists? was deprecated and removed in Ruby 3.2.
      if File.exist?(config)
        # Bug fix: this used to print the verbose flag instead of the
        # config path, and "Inspecting ..." printed unconditionally.
        puts "Loading config '#{config}'" if @options[:verbose]
        require config
      else
        if @options[:verbose]
          # TODO: Add support for initial rake task generation
          # Something like this:
          #   rake config:init  # Initializes config files with
          #                     # their defaults (if not exists already)
          puts "Default config does not exist, skipping - '#{config}'"
        end
      end
    end

    # Register plugins (specific crawlers): require every *.rb under
    # lib/plugins, then map each class in Apollo::Crawler::Plugins to a
    # lower-cased lookup key.
    def register_plugins
      dir = File.join(File.dirname(__FILE__), "lib", "plugins")
      puts "Registering plugins - '#{dir}'" if @options[:verbose]

      Dir.glob(File.join(dir, "**", "*.rb")).each do |site|
        require site
      end

      # Only constants that resolve to classes (skips nested modules).
      classes = Apollo::Crawler::Plugins.constants.select do |c|
        Class === Apollo::Crawler::Plugins.const_get(c)
      end

      classes.each do |sym|
        klass = Object.const_get('Apollo').const_get('Crawler').const_get('Plugins').const_get(sym)
        @plugins[sym.downcase.to_s] = klass
      end

      if @options[:verbose]
        @plugins.each do |plugin, klass|
          name = klass.new.class.name

          # Don't advertise the abstract base class.
          next if name == "Apollo::Crawler::Plugins::Plugin"

          puts "Registered '#{plugin}' -> '#{name}'"
        end
      end
    end

    # Entry point: wire everything together and run the requested plugins.
    def run
      init_options()

      # Bug fix: parse the command line BEFORE loading the config file so
      # that --verbose actually affects config-loading output (the config
      # used to be loaded with the defaults still in place).
      parse_options()

      load_config_file()

      # Register sites which can be crawled
      register_plugins()

      if @options[:list_plugins]
        puts "Listing plugins"
        puts "----------------------------------------"
        i = 0
        @plugins.sort.each do |plugin, klass|
          instance = klass.new
          puts "(#{i}) #{plugin} - #{instance.name}"
          i += 1
        end
        puts "----------------------------------------"
        return
      end

      plugins = ARGV

      plugins = @plugins.keys if @options[:run_all]

      puts @optparser if plugins.empty?

      plugins.each do |plugin|
        klass = @plugins[plugin.downcase]

        # Bug fix: unknown plugin names used to raise NoMethodError on nil.
        unless klass
          puts "Unknown plugin '#{plugin}', skipping"
          next
        end

        puts JSON.pretty_generate(klass.new.run)
      end
    end
  end
end
167
+
168
# Run the CLI driver only when this file is executed directly
# (not when required as a library).
if __FILE__ == $0
  Crawler::Program.new.run()
end
metadata ADDED
@@ -0,0 +1,51 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: apollo-crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Tomas Korcak
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-02-23 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Gem for crawling data from external resources
15
+ email: korczis@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - ./main.rb
21
+ - ./lib/crawler.rb
22
+ - ./lib/plugins/slashdot_org/slashdot.rb
23
+ - ./lib/plugins/firmy_cz/firmy.rb
24
+ - ./lib/plugins/alexa_com/alexa.rb
25
+ - ./lib/plugins/ycombinator_com/hacker_news.rb
26
+ - ./lib/plugin.rb
27
+ homepage: https://github.com/korczis/apollo-crawler
28
+ licenses: []
29
+ post_install_message:
30
+ rdoc_options: []
31
+ require_paths:
32
+ - lib
33
+ required_ruby_version: !ruby/object:Gem::Requirement
34
+ none: false
35
+ requirements:
36
+ - - ! '>='
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ required_rubygems_version: !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ! '>='
43
+ - !ruby/object:Gem::Version
44
+ version: '0'
45
+ requirements: []
46
+ rubyforge_project:
47
+ rubygems_version: 1.8.23
48
+ signing_key:
49
+ specification_version: 3
50
+ summary: Apollo Platform Crawler
51
+ test_files: []