cookler 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/cookler.rb +82 -0
- data/lib/cookler/utils.rb +46 -0
- metadata +50 -0
data/lib/cookler.rb
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
require 'anemone'
|
2
|
+
require 'mongo'
|
3
|
+
|
4
|
+
require 'cookler/utils'
|
5
|
+
|
6
|
+
# Crawls a set of target websites with Anemone, stores the matching pages in
# the 'cookler-db' MongoDB database and records word-frequency statistics for
# each target and for all targets combined.
#
# Each target is a [name, settings] pair (for example one entry of a Hash),
# where settings is indexed as follows:
#   [0] start URL(s) handed to Anemone::Core.new
#   [1] pattern a page URL must match to be analysed
#   [2] selector locating the content node inside a page
#   [3] when exactly +true+, only links matching [1] are followed
class Cookler

  # Number of pages analysed per target when no limit is given.
  DEFAULT_MAX_RESULTS = 15

  # targets     - enumerable of [name, settings] pairs (see the class comment).
  # max_results - maximum number of pages analysed per target (default 15).
  def initialize(targets, max_results=nil)
    @targets = targets
    @max_results = max_results.nil? ? DEFAULT_MAX_RESULTS : max_results
    @conn = Mongo::Connection.new
    @db = @conn['cookler-db']
    @pages = @db['pages']
    @stats = @db['stats']
    @depth_limit = 3
    @threads = []
  end

  # Convenience entry point: builds a Cookler and runs it immediately.
  # Returns whatever #run returns.
  def self.analyze(targets, max_results=nil)
    new(targets, max_results).run
  end

  # Empties the 'pages' and 'stats' collections, crawls every target in its
  # own thread, then stores one stats document per target plus a :general one
  # covering all targets. Returns the result of the final insert.
  def run
    @pages.remove
    @stats.remove

    general_stime = Time.now.to_f

    # Rebuilt on every run so that threads from a previous run are not kept.
    @threads = @targets.map do |target|
      Thread.new(target) do |current_target|
        target_stime = Time.now.to_f
        target_words = _get_words_on_pages(current_target)
        @stats.insert({:type => current_target[0], :duration => (Time.now.to_f - target_stime).to_s, :wstats => _word_stats(target_words)})
        # The thread's value; read by the caller instead of appending to a
        # shared array from several threads at once.
        target_words
      end
    end

    # Thread#value joins the thread and re-raises any exception it died with.
    total_words = @threads.map { |thread| thread.value }.flatten(1)

    @stats.insert({:type => :general, :duration => (Time.now.to_f - general_stime).to_s, :wstats => _word_stats(total_words)})
  end

  private

  # Crawls one target and returns the words found on its matching pages.
  # Every analysed page is also inserted into the 'pages' collection.
  # The crawl stops once @max_results pages have been analysed.
  def _get_words_on_pages(target)
    name, (start_url, url_pattern, selector, restrict_links) = target

    nb_results = 0
    words = []

    anemone = Anemone::Core.new(start_url, {:depth_limit => @depth_limit})
    anemone.storage = Anemone::Storage.MongoDB

    anemone.focus_crawl do |page|
      if nb_results >= @max_results
        anemone.kill_threads
        # Non-local return: leaves _get_words_on_pages itself.
        return words
      end
      if restrict_links == true
        page.links.delete_if { |link| (link.to_s =~ url_pattern).nil? }
      end
      page.links
    end

    anemone.on_pages_like(url_pattern) do |page|
      next if page.doc.nil?

      page.doc.css('script, style, br').each { |node| node.replace ' ' }

      # Skip pages where the selector matches nothing instead of raising
      # NoMethodError on nil.
      content = page.doc.at(selector)
      next if content.nil?

      data = content.inner_text
      @pages.insert({:type => name, :url => page.url.path, :title => _page_title(page), :data => data})
      words.concat(data.removeaccents.to_s.gsub(/[0-9]+/, " ").scan(/[\w']+/))
      nb_results += 1
    end

    anemone.run
    anemone.kill_threads
    words
  end

  # Returns the page's <title> content with CR, TAB and LF removed, falling
  # back to the URL path when the title cannot be read.
  def _page_title(page)
    page.doc.at('title').inner_html.gsub(/[\r\t\n]/, "")
  rescue StandardError
    page.url.path
  end

  # Builds a Hash of downcased word => number of occurrences, ordered from
  # the most to the least frequent word.
  def _word_stats(words)
    counts = words.group_by { |word| word.to_s.downcase }.map { |word, instances| [word, instances.length] }
    Hash[counts.sort_by(&:last).reverse]
  end

end
|
82
|
+
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# Reopens Anemone::Core to add #kill_threads, which kills every crawler
# thread that is still running.
module Anemone
  class Core
    # Kills each thread held in @tentacles that is still alive.
    def kill_threads
      @tentacles.each do |tentacle|
        next unless tentacle.alive?
        tentacle.kill
      end
    end
  end
end
|
9
|
+
|
10
|
+
# Patch adding String#removeaccents, which replaces accented Latin letters
# and ligatures with their unaccented ASCII equivalents, using
# String::ACCENTS_MAPPING as the source map.
class String

  # Replacement text => Unicode code points (decimal) that it replaces.
  # The ligature entries use real Unicode code points: 198/230 for the AE
  # ligatures and 338/339 for the OE ligatures. Code point 230 is listed only
  # under 'ae', so that ligature is not reduced to a single "a".
  ACCENTS_MAPPING = {
    'E' => [200,201,202,203],
    'e' => [232,233,234,235],
    'A' => [192,193,194,195,196,197],
    'a' => [224,225,226,227,228,229],
    'C' => [199],
    'c' => [231],
    'O' => [210,211,212,213,214,216],
    'o' => [242,243,244,245,246,248],
    'I' => [204,205,206,207],
    'i' => [236,237,238,239],
    'U' => [217,218,219,220],
    'u' => [249,250,251,252],
    'N' => [209],
    'n' => [241],
    'Y' => [221],
    'y' => [253,255],
    'AE' => [198],
    'ae' => [230],
    'OE' => [338],
    'oe' => [339]
  }.freeze

  # Returns a new String in which every character listed in ACCENTS_MAPPING
  # is replaced by its ASCII equivalent. The receiver is left unchanged.
  def removeaccents
    String::ACCENTS_MAPPING.inject(String.new(self)) do |str, (letter, codepoints)|
      # pack('U*') turns the code points into a UTF-8 string, used here as a
      # regexp character class.
      str.gsub(Regexp.new("[#{codepoints.pack('U*')}]"), letter)
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: cookler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Alexandre MOOTASSEM
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-08-19 00:00:00.000000000Z
|
13
|
+
dependencies: []
|
14
|
+
description: ! " Cookler is a Ruby library that provides spidering and analysing
|
15
|
+
features.\n It helps you to quickly write a program to retrieve content and statistics
|
16
|
+
of any website.\n The API is based on Anemone (http://anemone.rubyforge.org)
|
17
|
+
and uses MongoDB as its storage solution.\n Cookler has a multi-threaded and easy-to-use
|
18
|
+
design. The retrieved data and generated stats are\n stored in 'cookler-db'.\n"
|
19
|
+
email: alexandre.mootassem@gmail.com
|
20
|
+
executables: []
|
21
|
+
extensions: []
|
22
|
+
extra_rdoc_files: []
|
23
|
+
files:
|
24
|
+
- lib/cookler.rb
|
25
|
+
- lib/cookler/utils.rb
|
26
|
+
homepage: https://github.com/gastounage/cookler
|
27
|
+
licenses: []
|
28
|
+
post_install_message:
|
29
|
+
rdoc_options: []
|
30
|
+
require_paths:
|
31
|
+
- lib
|
32
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
requirements: []
|
45
|
+
rubyforge_project:
|
46
|
+
rubygems_version: 1.8.8
|
47
|
+
signing_key:
|
48
|
+
specification_version: 3
|
49
|
+
summary: Spidering and Analyzing Library
|
50
|
+
test_files: []
|