apollo-crawler 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/crawler.rb +0 -0
- data/lib/plugin.rb +37 -0
- data/lib/plugins/alexa_com/alexa.rb +36 -0
- data/lib/plugins/firmy_cz/firmy.rb +35 -0
- data/lib/plugins/slashdot_org/slashdot.rb +35 -0
- data/lib/plugins/ycombinator_com/hacker_news.rb +35 -0
- data/main.rb +170 -0
- metadata +51 -0
data/lib/crawler.rb
ADDED
File without changes
data/lib/plugin.rb
ADDED
@@ -0,0 +1,37 @@
require "open-uri"
require "nokogiri"

module Apollo
  module Crawler
    module Plugins
      class Plugin

        # Name of the plugin, used in docs, lookups, etc ...
        def name
          return "Plugin Base"
        end

        # - Fetch default URL (and transform it to document)
        # - Extract and Load (Store) important data
        # - Look for another documents
        #   Examples:
        #   - "next page"
        #   - "people you may know on Linked in"
        #   - "will attend on FB"
        def run
          return {
            :plugin => self.class.name
          }
        end

        # Extracts data from currently processed URL (called document here)
        def extract_doc_data
        end

        # This function tries to get links of another URLs (called leaf here) to crawl
        def fetch_leafs
        end
      end
    end
  end
end
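
A minimal sketch (not part of this release) of how a subclass might fill in the extract_doc_data / fetch_leafs hooks, which the base class leaves empty and which none of the bundled plugins override. The Example class, its URL, its XPath expressions, and the doc parameter are all assumptions for illustration; the released base class declares both hooks without parameters. It assumes the file sits in lib/ next to plugin.rb.

# Hypothetical plugin -- illustration only, not shipped in this gem.
require File.join(File.dirname(__FILE__), 'plugin')

module Apollo
  module Crawler
    module Plugins
      class Example < Plugin
        URL = "http://example.com/"

        def name
          return "Example"
        end

        def run
          doc = Nokogiri::HTML(open(URL))
          {
            :plugin => self.class.name,
            :data => extract_doc_data(doc),
            :leafs => fetch_leafs(doc)
          }
        end

        # Pull the headline text out of the current document
        def extract_doc_data(doc)
          doc.xpath("//h1").map { |i| i.text }
        end

        # Collect "next page"-style links to crawl next
        def fetch_leafs(doc)
          doc.xpath("//a[@rel = 'next']").map { |i| URI.join(URL, i['href']) }
        end
      end
    end
  end
end
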
data/lib/plugins/alexa_com/alexa.rb
ADDED
@@ -0,0 +1,36 @@
require File.join(File.dirname(__FILE__), '..', '..', 'plugin')

module Apollo
  module Crawler
    module Plugins
      # PARAMATRIZE: Plugin class name
      class Alexa < Plugin
        @@URL = "http://www.alexa.com/"

        @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"

        def name()
          return "Alexa Rank"
        end

        def run()
          # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
          doc = Nokogiri::HTML(open(@@URL))

          res = doc.xpath(@@MATCHER_ITEM).map { |i|
            {
              :text => i.text,
              :link => URI.join(@@URL, i['href'])
            }
          }

          return {
            :plugin => self.class.name,
            :title => doc.title,
            :res => res
          }
        end
      end
    end # Plugins
  end # Crawler
end # Apollo

data/lib/plugins/firmy_cz/firmy.rb
ADDED
@@ -0,0 +1,35 @@
require File.join(File.dirname(__FILE__), '..', '..', 'plugin')

module Apollo
  module Crawler
    module Plugins
      # PARAMATRIZE: Plugin class name
      class Firmy < Plugin
        @@URL = "http://www.firmy.cz/"

        @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"

        def name()
          return "Firmy.cz"
        end

        def run()
          doc = Nokogiri::HTML(open(@@URL))

          res = doc.xpath(@@MATCHER_ITEM).map { |i|
            {
              :text => i.text,
              :link => URI.join(@@URL, i['href'])
            }
          }

          return {
            :plugin => self.class.name,
            :title => doc.title,
            :res => res
          }
        end
      end
    end # Plugins
  end # Crawler
end # Apollo

data/lib/plugins/slashdot_org/slashdot.rb
ADDED
@@ -0,0 +1,35 @@
require File.join(File.dirname(__FILE__), '..', '..', 'plugin')

module Apollo
  module Crawler
    module Plugins
      # PARAMATRIZE: Plugin class name
      class Slashdot < Plugin
        @@URL = "http://slashdot.org/"

        @@MATCHER_ITEM = "//article/header/h2/span/a"

        def name
          return "Slashdot"
        end

        def run()
          doc = Nokogiri::HTML(open(@@URL))

          res = doc.xpath(@@MATCHER_ITEM).map { |i|
            {
              :text => i.text,
              :link => URI.join(@@URL, i['href'])
            }
          }

          return {
            :plugin => self.class.name,
            :title => doc.title,
            :res => res
          }
        end
      end
    end # Plugins
  end # Crawler
end # Apollo

data/lib/plugins/ycombinator_com/hacker_news.rb
ADDED
@@ -0,0 +1,35 @@
require File.join(File.dirname(__FILE__), '..', '..', 'plugin')

module Apollo
  module Crawler
    module Plugins
      # PARAMATRIZE: Plugin class name
      class HackerNews < Plugin
        @@URL = "http://news.ycombinator.com/"

        @@MATCHER_ITEM = "//td[@class = 'title']/a"

        def name
          return "Hacker News"
        end

        def run()
          doc = Nokogiri::HTML(open(@@URL))

          res = doc.xpath(@@MATCHER_ITEM).map { |i|
            {
              :text => i.text,
              :link => URI.join(@@URL, i['href'])
            }
          }

          return {
            :plugin => self.class.name,
            :title => doc.title,
            :res => res
          }
        end
      end
    end # Plugins
  end # Crawler
end # Apollo
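
The four plugins above share an identical run() body and differ only in @@URL, @@MATCHER_ITEM, and name. A possible refactoring (a sketch under that observation, not code from this release; the url and matcher accessors are hypothetical) would hoist the shared logic into the Plugin base class:

# Sketch only -- deduplicating run() across plugins.
require "open-uri"
require "nokogiri"

module Apollo
  module Crawler
    module Plugins
      class Plugin
        # Subclasses supply the page to fetch and the XPath for items
        def url; raise NotImplementedError; end
        def matcher; raise NotImplementedError; end

        def run
          doc = Nokogiri::HTML(open(url))
          res = doc.xpath(matcher).map { |i|
            { :text => i.text, :link => URI.join(url, i['href']) }
          }
          { :plugin => self.class.name, :title => doc.title, :res => res }
        end
      end

      # Each concrete plugin then shrinks to three one-liners
      class HackerNews < Plugin
        def url; "http://news.ycombinator.com/"; end
        def matcher; "//td[@class = 'title']/a"; end
        def name; "Hacker News"; end
      end
    end
  end
end
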
data/main.rb
ADDED
@@ -0,0 +1,170 @@
#! /usr/bin/env ruby

require "rubygems"
require "bundler/setup"

require 'json'

require "thor"

require "open-uri"
require "nokogiri"

require "pp"
require "optparse"

module Crawler
  class Program
    # This hash will hold all of the options
    # parsed from the command-line by
    # OptionParser.
    @options = nil
    @optparser = nil
    @plugins = nil

    # Initializer - Constructor
    def initialize
      @plugins = {}
    end

    # Initialize command-line options
    def init_options
      @options = {}
      @options[:verbose] = false

      @optparser = OptionParser.new do | opts |
        # This displays the help screen, all programs are
        # assumed to have this option.
        opts.on('-h', '--help', 'Display this screen') do
          puts opts
          exit
        end

        opts.on('-a', '--all', 'Run all plugins') do
          @options[:run_all] = true
        end

        opts.on('-v', '--verbose', 'Enable verbose output') do
          @options[:verbose] = true
        end

        opts.on('-l', '--list-plugins', 'List of plugins') do
          @options[:list_plugins] = true
        end
      end
    end

    # Parse the options passed to command-line
    def parse_options
      # Parse the command-line. Remember there are two forms
      # of the parse method. The 'parse' method simply parses
      # ARGV, while the 'parse!' method parses ARGV and removes
      # any options found there, as well as any parameters for
      # the options. What's left is the list of files to resize.
      @optparser.parse!
    end

    # Load global options first
    # Merge it with local options (if they exist)
    def load_config_file()
      config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
      puts "Inspecting #{config} ..."
      if(File.exists?(config))
        if(@options[:verbose])
          puts "Loading config '#{config}'"
        end

        puts "Let's require '#{@options[:verbose]}'"
        require config
      else
        if(@options[:verbose])
          # TODO: Add support for initial rake task generation
          # Something like this:
          # rake config:init # Initializes config files with
          #                  # their defaults (if not exists already)
          puts "Default config does not exist, skipping - '#{config}'"
        end
      end
    end

    # Register plugins (specific crawlers)
    def register_plugins()
      dir = File.join(File.dirname(__FILE__), "lib", "plugins")
      if(@options[:verbose])
        puts "Registering plugins - '#{dir}'"
      end

      sites = File.join(dir, "**", "*.rb")
      Dir.glob(sites).each do |site|
        require site
      end

      tmp = Apollo::Crawler::Plugins.constants.select { |c|
        Class === Apollo::Crawler::Plugins.const_get(c)
      }

      tmp.each do |x|
        klass = Object.const_get('Apollo').const_get('Crawler').const_get('Plugins').const_get(x)
        @plugins.merge!({ x.downcase.to_s => klass})
      end

      if(@options[:verbose])
        @plugins.each do |plugin, klass|
          name = klass.new.class.name

          if name == "Apollo::Crawler::Plugins::Plugin"
            next
          end

          puts "Registered '#{plugin}' -> '#{name}'"
        end
      end
    end

    def run
      init_options()

      load_config_file()

      parse_options()

      # Register sites which can be crawled
      register_plugins()

      if(@options[:list_plugins])
        puts "Listing plugins"
        puts "----------------------------------------"
        i = 0
        @plugins.sort.each do |plugin, klass|
          instance = klass.new
          # puts klass.class_eval("@@NAME")
          puts "(#{i}) #{plugin} - #{instance.name}"
          i += 1
        end
        puts "----------------------------------------"
        return
      end

      plugins = ARGV

      if(@options[:run_all])
        plugins = @plugins.keys
      end

      if(plugins.empty?)
        puts @optparser
      end

      plugins.each do |plugin|
        p = @plugins[plugin.downcase].new

        # puts "Running '#{plugin}'"
        puts JSON.pretty_generate(p.run)
      end
    end
  end
end

if __FILE__ == $0
  Crawler::Program.new.run()
end
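
register_plugins keys each plugin by its downcased constant name, so the CLI names are "alexa", "firmy", "slashdot", and "hackernews" (plus "plugin" for the base class), invoked as e.g. "ruby main.rb hackernews", "ruby main.rb -l", or "ruby main.rb --all". A short sketch of driving a plugin directly, mirroring what main.rb does per ARGV entry (an illustration assuming the script sits next to main.rb in the gem root; not shipped code):

# Sketch -- run one plugin without going through the CLI.
require "json"
require File.join(File.dirname(__FILE__), "lib", "plugins", "ycombinator_com", "hacker_news")

plugin = Apollo::Crawler::Plugins::HackerNews.new
puts plugin.name                      # => "Hacker News"
puts JSON.pretty_generate(plugin.run) # fetches the page and dumps extracted links
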
metadata
ADDED
@@ -0,0 +1,51 @@
--- !ruby/object:Gem::Specification
name: apollo-crawler
version: !ruby/object:Gem::Version
  version: 0.0.1
prerelease:
platform: ruby
authors:
- Tomas Korcak
autorequire:
bindir: bin
cert_chain: []
date: 2013-02-23 00:00:00.000000000 Z
dependencies: []
description: Gem for crawling data from external resources
email: korczis@gmail.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- ./main.rb
- ./lib/crawler.rb
- ./lib/plugins/slashdot_org/slashdot.rb
- ./lib/plugins/firmy_cz/firmy.rb
- ./lib/plugins/alexa_com/alexa.rb
- ./lib/plugins/ycombinator_com/hacker_news.rb
- ./lib/plugin.rb
homepage: https://github.com/korczis/apollo-crawler
licenses: []
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 1.8.23
signing_key:
specification_version: 3
summary: Apollo Platform Crawler
test_files: []
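
Note that the gemspec declares dependencies: [], yet main.rb requires bundler/setup, thor, and nokogiri at boot, so those gems must be provided out of band. A Gemfile along these lines would be needed (an assumption -- no Gemfile appears in the release's file list):

# Hypothetical Gemfile -- not included in this release.
source "https://rubygems.org"

gem "nokogiri"
gem "thor"
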