cookler 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/cookler.rb +82 -0
- data/lib/cookler/utils.rb +46 -0
- metadata +50 -0
data/lib/cookler.rb
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
require 'anemone'
|
2
|
+
require 'mongo'
|
3
|
+
|
4
|
+
require 'cookler/utils'
|
5
|
+
|
6
|
+
# Crawls a set of target websites with Anemone, stores the matching pages in
# the 'pages' collection of the 'cookler-db' MongoDB database and writes word
# frequency statistics (per target and overall) to its 'stats' collection.
#
# Each target is read as target[0] (a label stored as :type) and target[1],
# an indexable list of settings:
#   target[1][0] - start URL handed to Anemone::Core.new
#   target[1][1] - pattern given to on_pages_like and used to filter links
#   target[1][2] - selector of the node whose text is analysed
#   target[1][3] - when exactly true, only links matching the pattern are followed
# NOTE(review): iterating a Hash of label => settings yields this shape;
# confirm against the callers.
class Cookler

  # Number of pages collected per target when no max_results is given.
  DEFAULT_MAX_RESULTS = 15
  # Value passed to Anemone as :depth_limit.
  DEFAULT_DEPTH_LIMIT = 3

  # targets     - collection of targets (see the class comment for the shape).
  # max_results - page limit per target; nil selects DEFAULT_MAX_RESULTS.
  #
  # Opens a MongoDB connection with the driver defaults.
  def initialize(targets, max_results=nil)
    @targets = targets
    @max_results = max_results.nil? ? DEFAULT_MAX_RESULTS : max_results
    @conn = Mongo::Connection.new
    @db = @conn['cookler-db']
    @pages = @db['pages']
    @stats = @db['stats']
    @depth_limit = DEFAULT_DEPTH_LIMIT
    @threads = []
  end

  # Convenience entry point: builds a Cookler and runs it.
  # Returns whatever #run returns.
  def self.analyze(targets, max_results=nil)
    new(targets, max_results).run
  end

  # Empties the 'pages' and 'stats' collections, crawls every target in its
  # own thread, stores one stats document per target and finally one document
  # of type :general covering the words of all targets.
  #
  # Returns the result of the final @stats.insert call. An exception raised
  # inside a crawl thread is re-raised here when that thread is collected.
  def run
    @pages.remove
    @stats.remove

    general_stime = Time.now.to_f

    # Start from a fresh list on every run so threads of a previous run are
    # never collected again.
    @threads = @targets.map do |target|
      Thread.new(target) do |current_target|
        current_target_stime = Time.now.to_f
        current_target_words = _get_words_on_pages(current_target)
        @stats.insert({
          :type     => current_target[0],
          :duration => (Time.now.to_f - current_target_stime).to_s,
          :wstats   => _word_stats(current_target_words)
        })
        # The thread's value is its word list; no variable is shared between
        # threads, so no words can be lost to interleaved updates.
        current_target_words
      end
    end

    # Thread#value joins the thread before returning its result.
    total_words = @threads.inject([]) { |all, thread| all.concat(thread.value) }

    @stats.insert({
      :type     => :general,
      :duration => (Time.now.to_f - general_stime).to_s,
      :wstats   => _word_stats(total_words)
    })
  end

  private

  # Builds a Hash of downcased word => number of occurrences, with the
  # entries inserted from the highest count to the lowest.
  def _word_stats(words)
    counts = words.group_by { |word| word.to_s.downcase }.map { |word, instances| [word, instances.length] }
    Hash[counts.sort_by(&:last).reverse]
  end

  # Crawls one target and returns the Array of words found on its pages.
  #
  # Every page that has a parsed document and a node matching the content
  # selector is stored in @pages; pages without either are skipped. Crawling
  # stops once @max_results pages have been collected.
  def _get_words_on_pages(target)
    type = target[0]
    settings = target[1]
    nb_results = 0
    words = []

    anemone = Anemone::Core.new(settings[0], {:depth_limit => @depth_limit})
    anemone.storage = Anemone::Storage.MongoDB

    anemone.focus_crawl do |page|
      if nb_results >= @max_results
        anemone.kill_threads
        # Non-local return: leaves this method (and therefore anemone.run)
        # from inside the block. Assumes Anemone invokes this block on the
        # thread that called anemone.run -- TODO confirm.
        return words
      end
      if settings[3] == true
        page.links.delete_if { |link| (link.to_s =~ settings[1]).nil? }
      end
      page.links
    end

    anemone.on_pages_like(settings[1]) do |page|
      next if page.doc.nil?

      # Replace non-textual nodes by a space so adjacent words stay separated.
      page.doc.css('script, style, br').each { |node| node.replace ' ' }

      content = page.doc.at(settings[2])
      # A page may match the URL pattern without containing the content node.
      next if content.nil?

      title_node = page.doc.at('title')
      # Fall back to the URL path when the page has no <title>.
      title = title_node.nil? ? page.url.path : title_node.inner_html.gsub(/[\r\t\n]/, "")
      data = content.inner_text

      @pages.insert({:type => type, :url => page.url.path, :title => title, :data => data})
      # Digits are blanked out first so numbers never count as words.
      words.concat(data.removeaccents.to_s.gsub(/[0-9]+/, " ").scan(/[\w']+/))
      nb_results += 1
    end

    anemone.run
    anemone.kill_threads
    words
  end

end
|
82
|
+
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# Monkey patch a new method kill_threads in Anemone::Core to kill all running threads:
|
2
|
+
module Anemone
  class Core
    # Kills every thread held in @tentacles that is still alive.
    #
    # @tentacles is an instance variable owned by Anemone::Core itself, not by
    # this patch, so it is treated as an empty list when it is unset instead
    # of raising on nil.
    def kill_threads
      Array(@tentacles).each { |thread| thread.kill if thread.alive? }
    end
  end
end
|
9
|
+
|
10
|
+
# Patch to remove the accents from the string. (Uses String::ACCENTS_MAPPING as the source map).
|
11
|
+
class String

  # Maps each plain replacement string to the Unicode code points (decimal)
  # of the characters it replaces.
  ACCENTS_MAPPING = {
    'E' => [200,201,202,203],
    'e' => [232,233,234,235],
    'A' => [192,193,194,195,196,197],
    # 230 (ae ligature) is deliberately not listed here: it has its own 'ae' entry.
    'a' => [224,225,226,227,228,229],
    'C' => [199],
    'c' => [231],
    'O' => [210,211,212,213,214,216],
    'o' => [242,243,244,245,246,248],
    'I' => [204,205,206,207],
    'i' => [236,237,238,239],
    'U' => [217,218,219,220],
    'u' => [249,250,251,252],
    'N' => [209],
    'n' => [241],
    'Y' => [221],
    'y' => [253,255],
    'AE' => [198], # U+00C6
    'ae' => [230], # U+00E6
    'OE' => [338], # U+0152
    'oe' => [339]  # U+0153
  }

  # Returns a new String in which every character listed in
  # String::ACCENTS_MAPPING is replaced by its plain counterpart.
  # The receiver is left unchanged.
  def removeaccents
    str = String.new(self)
    String::ACCENTS_MAPPING.each do |letter, accents|
      # pack('U*') turns the code points into a UTF-8 string, used here as a
      # character class.
      str.gsub!(Regexp.new("[#{accents.pack('U*')}]"), letter)
    end
    str
  end
end
|
metadata
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: cookler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Alexandre MOOTASSEM
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-08-19 00:00:00.000000000Z
|
13
|
+
dependencies: []
|
14
|
+
description: ! " Cookler is a Ruby library that provides spidering and analysing
|
15
|
+
features.\n It helps you to quickly write a program to retrieve content and statistics
|
16
|
+
of any website.\n The API is based on Anemone (http://anemone.rubyforge.org)
|
17
|
+
and use MongoDB as storage solution.\n Cookler has a multi-threaded and easy-to-use
|
18
|
+
design. The retrieved data and generated stats are\n stored in 'cookler-db'.\n"
|
19
|
+
email: alexandre.mootassem@gmail.com
|
20
|
+
executables: []
|
21
|
+
extensions: []
|
22
|
+
extra_rdoc_files: []
|
23
|
+
files:
|
24
|
+
- lib/cookler.rb
|
25
|
+
- lib/cookler/utils.rb
|
26
|
+
homepage: https://github.com/gastounage/cookler
|
27
|
+
licenses: []
|
28
|
+
post_install_message:
|
29
|
+
rdoc_options: []
|
30
|
+
require_paths:
|
31
|
+
- lib
|
32
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
requirements: []
|
45
|
+
rubyforge_project:
|
46
|
+
rubygems_version: 1.8.8
|
47
|
+
signing_key:
|
48
|
+
specification_version: 3
|
49
|
+
summary: Spidering and Analyzing Library
|
50
|
+
test_files: []
|