apollo-crawler 0.0.3 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
File without changes
@@ -0,0 +1,37 @@
1
+ require "open-uri"
2
+ require "nokogiri"
3
+
4
module Apollo
	module Crawler
		module Plugins
			# Base class for all crawler plugins. Concrete plugins override
			# #name and #run; #extract_doc_data and #fetch_leafs are hooks
			# for extraction and link-following logic.
			class Plugin

				# Human-readable name of the plugin, used in docs, lookups, etc.
				def name
					"Plugin Base"
				end

				# Crawl entry point. A plugin typically:
				# - fetches its default URL (and transforms it to a document)
				# - extracts and loads (stores) the important data
				# - looks for further documents to crawl, e.g.:
				#   - "next page"
				#   - "people you may know on Linked in"
				#   - "will attend on FB"
				# Returns a Hash describing the result.
				def run
					{ :plugin => self.class.name }
				end

				# Extracts data from currently processed URL (called document here).
				# No-op in the base class.
				def extract_doc_data
				end

				# Tries to get links of other URLs (called leaf here) to crawl.
				# No-op in the base class.
				def fetch_leafs
				end
			end
		end
	end
end
@@ -0,0 +1,36 @@
1
# FIX: original read `File.join(F'..', ...)` — the stray `F` was a syntax error.
require File.join('..', '..', '..', 'plugin')

module Apollo
	module Crawler
		module Plugins
			# Crawls the Alexa front page and extracts the alphabetical
			# category links.
			# PARAMATRIZE: Plugin class name
			class Alexa < Plugin
				@@URL = "http://www.alexa.com/"

				# XPath for the alphabetical category anchors on the front page.
				@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"

				# Human-readable plugin name.
				def name()
					return "Alexa Rank"
				end

				# Fetches the front page and returns a Hash with the plugin
				# class name, the page title and the text/absolute link of
				# every matched anchor.
				# TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
				def run()
					doc = Nokogiri::HTML(open(@@URL))

					res = doc.xpath(@@MATCHER_ITEM).map { |i|
						{
							:text => i.text,
							:link => URI.join(@@URL, i['href'])
						}
					}

					return {
						:plugin => self.class.name,
						:title => doc.title,
						:res => res
					}
				end
			end
		end # Plugins
	end # Crawler
end # Apollo
@@ -0,0 +1,35 @@
1
+ require File.join('..', '..', '..', 'plugin')
2
+
3
module Apollo
	module Crawler
		module Plugins
			# Crawls the firmy.cz front page and extracts the alphabetical
			# category links.
			# PARAMATRIZE: Plugin class name
			class Firmy < Plugin
				@@URL = "http://www.firmy.cz/"

				# XPath for the alphabetical category anchors on the front page.
				@@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"

				# Human-readable plugin name.
				def name
					"Firmy.cz"
				end

				# Fetches the front page and returns a Hash with the plugin
				# class name, the page title and the text/absolute link of
				# every matched anchor.
				def run
					doc = Nokogiri::HTML(open(@@URL))

					items = doc.xpath(@@MATCHER_ITEM).map do |anchor|
						{ :text => anchor.text, :link => URI.join(@@URL, anchor['href']) }
					end

					{ :plugin => self.class.name, :title => doc.title, :res => items }
				end
			end
		end # Plugins
	end # Crawler
end # Apollo
@@ -0,0 +1,35 @@
1
+ require File.join('..', '..', '..', 'plugin')
2
+
3
module Apollo
	module Crawler
		module Plugins
			# Crawls the Slashdot front page and extracts story headlines.
			# PARAMATRIZE: Plugin class name
			class Slashdot < Plugin
				@@URL = "http://slashdot.org/"

				# XPath matching the headline anchors on the front page.
				@@MATCHER_ITEM = "//article/header/h2/span/a"

				# Human-readable plugin name.
				def name
					"Slashdot"
				end

				# Fetches the front page and returns a Hash with the plugin
				# class name, the page title and the text/absolute link of
				# every matched headline.
				def run
					doc = Nokogiri::HTML(open(@@URL))

					entries = doc.xpath(@@MATCHER_ITEM).map do |node|
						{ :text => node.text, :link => URI.join(@@URL, node['href']) }
					end

					{ :plugin => self.class.name, :title => doc.title, :res => entries }
				end
			end
		end # Plugins
	end # Crawler
end # Apollo
@@ -0,0 +1,35 @@
1
+ require File.join('..', '..', '..', 'plugin')
2
+
3
module Apollo
	module Crawler
		module Plugins
			# Crawls the Hacker News front page and extracts story links.
			# PARAMATRIZE: Plugin class name
			class HackerNews < Plugin
				@@URL = "http://news.ycombinator.com/"

				# XPath matching the story-title anchors on the front page.
				@@MATCHER_ITEM = "//td[@class = 'title']/a"

				# Human-readable plugin name.
				def name
					"Hacker News"
				end

				# Fetches the front page and returns a Hash with the plugin
				# class name, the page title and the text/absolute link of
				# every matched story.
				def run
					doc = Nokogiri::HTML(open(@@URL))

					stories = doc.xpath(@@MATCHER_ITEM).map do |link_node|
						{ :text => link_node.text, :link => URI.join(@@URL, link_node['href']) }
					end

					{ :plugin => self.class.name, :title => doc.title, :res => stories }
				end
			end
		end # Plugins
	end # Crawler
end # Apollo
data/main.rb ADDED
@@ -0,0 +1,170 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ require "rubygems"
4
+ require "bundler/setup"
5
+
6
+ require 'json'
7
+
8
+ require "thor"
9
+
10
+ require "open-uri"
11
+ require "nokogiri"
12
+
13
+ require "pp"
14
+ require "optparse"
15
+
16
module Crawler
	# Command-line driver: parses options, loads the optional config
	# file, discovers plugin classes on disk and runs the requested ones.
	class Program
		# This hash will hold all of the options
		# parsed from the command-line by
		# OptionParser.
		@options = nil
		@optparser = nil
		@plugins = nil

		# Initializer - Constructor
		def initialize
			@plugins = {}
		end

		# Initialize command-line options and build the OptionParser.
		def init_options
			@options = {}
			@options[:verbose] = false

			@optparser = OptionParser.new do | opts |
				# This displays the help screen, all programs are
				# assumed to have this option.
				opts.on('-h', '--help', 'Display this screen') do
					puts opts
					exit
				end

				opts.on('-a', '--all', 'Run all plugins') do
					@options[:run_all] = true
				end

				opts.on('-v', '--verbose', 'Enable verbose output') do
					@options[:verbose] = true
				end

				opts.on('-l', '--list-plugins', 'List of plugins') do
					@options[:list_plugins] = true
				end
			end
		end

		# Parse the options passed to command-line.
		# 'parse!' parses ARGV and removes any options found there,
		# as well as any parameters for the options. What's left is
		# the list of plugin names to run.
		def parse_options
			@optparser.parse!
		end

		# Load global options first
		# Merge it with local options (if they exists)
		def load_config_file()
			config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
			# FIX: File.exists? is deprecated; use File.exist?
			if File.exist?(config)
				# FIX: removed leftover debug output (one line interpolated
				# the verbose flag instead of the config path).
				puts "Loading config '#{config}'" if @options[:verbose]

				require config
			else
				if @options[:verbose]
					# TODO: Add support for initial rake task generation
					# Something like this:
					#   rake config:init # Initializes config files with
					#                      their defaults (if not exists already)
					puts "Default config does not exist, skipping - '#{config}'"
				end
			end
		end

		# Register plugins (specific crawlers) found under
		# lib/apollo_crawler/plugins, keyed by lower-cased class name.
		def register_plugins()
			dir = File.join(File.dirname(__FILE__), "lib", "apollo_crawler", "plugins")
			puts "Registering plugins - '#{dir}'" if @options[:verbose]

			sites = File.join(dir, "**", "*.rb")
			Dir.glob(sites).each do |site|
				require site
			end

			# Collect every class defined under Apollo::Crawler::Plugins.
			tmp = Apollo::Crawler::Plugins.constants.select { |c|
				Class === Apollo::Crawler::Plugins.const_get(c)
			}

			tmp.each do |x|
				klass = Object.const_get('Apollo').const_get('Crawler').const_get('Plugins').const_get(x)
				@plugins.merge!({ x.downcase.to_s => klass })
			end

			if @options[:verbose]
				@plugins.each do |plugin, klass|
					name = klass.new.class.name

					# Skip the abstract base class in the listing.
					next if name == "Apollo::Crawler::Plugins::Plugin"

					puts "Registered '#{plugin}' -> '#{name}'"
				end
			end
		end

		# Entry point: parse options, load config, register plugins
		# and run the requested plugins (or list them).
		def run
			init_options()

			# FIX: parse CLI flags before loading the config so that
			# --verbose already applies to config loading and
			# plugin registration.
			parse_options()

			load_config_file()

			# Register sites which can be crawled
			register_plugins()

			if @options[:list_plugins]
				puts "Listing plugins"
				puts "----------------------------------------"
				i = 0
				@plugins.sort.each do |plugin, klass|
					instance = klass.new
					puts "(#{i}) #{plugin} - #{instance.name}"
					i += 1
				end
				puts "----------------------------------------"
				return
			end

			plugins = ARGV

			if @options[:run_all]
				plugins = @plugins.keys
			end

			if plugins.empty?
				puts @optparser
			end

			plugins.each do |plugin|
				klass = @plugins[plugin.downcase]

				# FIX: guard against unknown plugin names instead of
				# crashing with NoMethodError on nil.
				if klass.nil?
					$stderr.puts "Unknown plugin '#{plugin}', skipping"
					next
				end

				puts JSON.pretty_generate(klass.new.run)
			end
		end
	end
end
167
+
168
# Run the crawler only when this file is executed directly,
# not when it is required by another script.
if __FILE__ == $0
	Crawler::Program.new.run
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -16,7 +16,14 @@ email: korczis@gmail.com
16
16
  executables: []
17
17
  extensions: []
18
18
  extra_rdoc_files: []
19
- files: []
19
+ files:
20
+ - ./main.rb
21
+ - ./lib/apollo_crawler/crawler.rb
22
+ - ./lib/apollo_crawler/plugins/slashdot_org/slashdot.rb
23
+ - ./lib/apollo_crawler/plugins/firmy_cz/firmy.rb
24
+ - ./lib/apollo_crawler/plugins/alexa_com/alexa.rb
25
+ - ./lib/apollo_crawler/plugins/ycombinator_com/hacker_news.rb
26
+ - ./lib/apollo_crawler/plugin.rb
20
27
  homepage: https://github.com/korczis/apollo-crawler
21
28
  licenses: []
22
29
  post_install_message: