apollo-crawler 0.0.2 → 0.0.3

metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: apollo-crawler
  version: !ruby/object:Gem::Version
- version: 0.0.2
+ version: 0.0.3
  prerelease:
  platform: ruby
  authors:
@@ -16,14 +16,7 @@ email: korczis@gmail.com
  executables: []
  extensions: []
  extra_rdoc_files: []
- files:
- - ./main.rb
- - ./lib/crawler.rb
- - ./lib/plugins/slashdot_org/slashdot.rb
- - ./lib/plugins/firmy_cz/firmy.rb
- - ./lib/plugins/alexa_com/alexa.rb
- - ./lib/plugins/ycombinator_com/hacker_news.rb
- - ./lib/plugin.rb
+ files: []
  homepage: https://github.com/korczis/apollo-crawler
  licenses: []
  post_install_message:
data/lib/crawler.rb DELETED
File without changes
data/lib/plugin.rb DELETED
@@ -1,37 +0,0 @@
- require "open-uri"
- require "nokogiri"
-
- module Apollo
-   module Crawler
-     module Plugins
-       class Plugin
-
-         # Name of the plugin, used in docs, lookups, etc ...
-         def name
-           return "Plugin Base"
-         end
-
-         # - Fetch default URL (and transform it to document)
-         # - Extract and Load (Store) important data
-         # - Look for another documents
-         # Examples:
-         # - "next page"
-         # - "people you may know on Linked in"
-         # - "will attend on FB")
-         def run
-           return {
-             :plugin => self.class.name
-           }
-         end
-
-         # Extracts data from currently processed URL (called document here)
-         def extract_doc_data
-         end
-
-         # This function tries to get links of another URLs (called leaf here) to crawl
-         def fetch_leafs
-         end
-       end
-     end
-   end
- end
data/lib/plugins/alexa_com/alexa.rb DELETED
@@ -1,36 +0,0 @@
- require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
-
- module Apollo
-   module Crawler
-     module Plugins
-       # PARAMATRIZE: Plugin class name
-       class Alexa < Plugin
-         @@URL = "http://www.alexa.com/"
-
-         @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
-
-         def name()
-           return "Alexa Rank"
-         end
-
-         def run()
-           # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
-           doc = Nokogiri::HTML(open(@@URL))
-
-           res = doc.xpath(@@MATCHER_ITEM).map { |i|
-             {
-               :text => i.text,
-               :link => URI.join(@@URL, i['href'])
-             }
-           }
-
-           return {
-             :plugin => self.class.name,
-             :title => doc.title,
-             :res => res
-           }
-         end
-       end
-     end # Plugins
-   end # Crawler
- end # Apollo
data/lib/plugins/firmy_cz/firmy.rb DELETED
@@ -1,35 +0,0 @@
- require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
-
- module Apollo
-   module Crawler
-     module Plugins
-       # PARAMATRIZE: Plugin class name
-       class Firmy < Plugin
-         @@URL = "http://www.firmy.cz/"
-
-         @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
-
-         def name()
-           return "Firmy.cz"
-         end
-
-         def run()
-           doc = Nokogiri::HTML(open(@@URL))
-
-           res = doc.xpath(@@MATCHER_ITEM).map { |i|
-             {
-               :text => i.text,
-               :link => URI.join(@@URL, i['href'])
-             }
-           }
-
-           return {
-             :plugin => self.class.name,
-             :title => doc.title,
-             :res => res
-           }
-         end
-       end
-     end # Plugins
-   end # Crawler
- end # Apollo
data/lib/plugins/slashdot_org/slashdot.rb DELETED
@@ -1,35 +0,0 @@
- require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
-
- module Apollo
-   module Crawler
-     module Plugins
-       # PARAMATRIZE: Plugin class name
-       class Slashdot < Plugin
-         @@URL = "http://slashdot.org/"
-
-         @@MATCHER_ITEM = "//article/header/h2/span/a"
-
-         def name
-           return "Slashdot"
-         end
-
-         def run()
-           doc = Nokogiri::HTML(open(@@URL))
-
-           res = doc.xpath(@@MATCHER_ITEM).map { |i|
-             {
-               :text => i.text,
-               :link => URI.join(@@URL, i['href'])
-             }
-           }
-
-           return {
-             :plugin => self.class.name,
-             :title => doc.title,
-             :res => res
-           }
-         end
-       end
-     end # Plugins
-   end # Crawler
- end # Apollo
data/lib/plugins/ycombinator_com/hacker_news.rb DELETED
@@ -1,35 +0,0 @@
- require File.join(File.dirname(__FILE__), '..', '..', 'plugin')
-
- module Apollo
-   module Crawler
-     module Plugins
-       # PARAMATRIZE: Plugin class name
-       class HackerNews < Plugin
-         @@URL = "http://news.ycombinator.com/"
-
-         @@MATCHER_ITEM = "//td[@class = 'title']/a"
-
-         def name
-           return "Hacker News"
-         end
-
-         def run()
-           doc = Nokogiri::HTML(open(@@URL))
-
-           res = doc.xpath(@@MATCHER_ITEM).map { |i|
-             {
-               :text => i.text,
-               :link => URI.join(@@URL, i['href'])
-             }
-           }
-
-           return {
-             :plugin => self.class.name,
-             :title => doc.title,
-             :res => res
-           }
-         end
-       end
-     end # Plugins
-   end # Crawler
- end # Apollo
data/main.rb DELETED
@@ -1,170 +0,0 @@
- #! /usr/bin/env ruby
-
- require "rubygems"
- require "bundler/setup"
-
- require 'json'
-
- require "thor"
-
- require "open-uri"
- require "nokogiri"
-
- require "pp"
- require "optparse"
-
- module Crawler
-   class Program
-     # This hash will hold all of the options
-     # parsed from the command-line by
-     # OptionParser.
-     @options = nil
-     @optparser = nil
-     @plugins = nil
-
-     # Initializer - Constructor
-     def initialize
-       @plugins = {}
-     end
-
-     # Initialize command-line options
-     def init_options
-       @options = {}
-       @options[:verbose] = false
-
-       @optparser = OptionParser.new do | opts |
-         # This displays the help screen, all programs are
-         # assumed to have this option.
-         opts.on('-h', '--help', 'Display this screen') do
-           puts opts
-           exit
-         end
-
-         opts.on('-a', '--all', 'Run all plugins') do
-           @options[:run_all] = true
-         end
-
-         opts.on('-v', '--verbose', 'Enable verbose output') do
-           @options[:verbose] = true
-         end
-
-         opts.on('-l', '--list-plugins', 'List of plugins') do
-           @options[:list_plugins] = true
-         end
-       end
-     end
-
-     # Parse the options passed to command-line
-     def parse_options
-       # Parse the command-line. Remember there are two forms
-       # of the parse method. The 'parse' method simply parses
-       # ARGV, while the 'parse!' method parses ARGV and removes
-       # any options found there, as well as any parameters for
-       # the options. What's left is the list of files to resize.
-       @optparser.parse!
-     end
-
-     # Load global options first
-     # Merge it with local options (if they exists)
-     def load_config_file()
-       config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
-       puts "Inspecting #{config} ..."
-       if(File.exists?(config))
-         if(@options[:verbose])
-           puts "Loading config '#{config}'"
-         end
-
-         puts "Let's require '#{@options[:verbose]}'"
-         require config
-       else
-         if(@options[:verbose])
-           # TODO: Add support for initial rake task generation
-           # Something like this:
-           # rake config:init # Initializes config files with
-           # their defaults (if not exists already)
-           puts "Default config does not exist, skipping - '#{config}'"
-         end
-       end
-     end
-
-     # Register plugins (specific crawlers)
-     def register_plugins()
-       dir = File.join(File.dirname(__FILE__), "lib","plugins")
-       if(@options[:verbose])
-         puts "Registering plugins - '#{dir}'"
-       end
-
-       sites = File.join(dir, "**", "*.rb")
-       Dir.glob(sites).each do |site|
-         require site
-       end
-
-       tmp = Apollo::Crawler::Plugins.constants.select { |c|
-         Class === Apollo::Crawler::Plugins.const_get(c)
-       }
-
-       tmp.each do |x|
-         klass = Object.const_get('Apollo').const_get('Crawler').const_get('Plugins').const_get(x)
-         @plugins.merge!({ x.downcase.to_s => klass})
-       end
-
-       if(@options[:verbose])
-         @plugins.each do |plugin, klass|
-           name = klass.new.class.name
-
-           if name == "Apollo::Crawler::Plugins::Plugin"
-             next
-           end
-
-           puts "Registered '#{plugin}' -> '#{name}'"
-         end
-       end
-     end
-
-     def run
-       init_options()
-
-       load_config_file()
-
-       parse_options()
-
-       # Register sites which can be crawled
-       register_plugins()
-
-       if(@options[:list_plugins])
-         puts "Listing plugins"
-         puts "----------------------------------------"
-         i = 0
-         @plugins.sort.each do |plugin, klass|
-           instance = klass.new
-           # puts klass.class_eval("@@NAME")
-           puts "(#{i}) #{plugin} - #{instance.name}"
-           i += 1
-         end
-         puts "----------------------------------------"
-         return
-       end
-
-       plugins = ARGV
-
-       if(@options[:run_all])
-         plugins = @plugins.keys
-       end
-
-       if(plugins.empty?)
-         puts @optparser
-       end
-
-       plugins.each do |plugin|
-         p = @plugins[plugin.downcase].new
-
-         # puts "Running '#{plugin}'"
-         puts JSON.pretty_generate(p.run)
-       end
-     end
-   end
- end
-
- if __FILE__ == $0
-   Crawler::Program.new.run()
- end