cookler 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/lib/cookler.rb +82 -0
  2. data/lib/cookler/utils.rb +46 -0
  3. metadata +50 -0
@@ -0,0 +1,82 @@
1
+ require 'anemone'
2
+ require 'mongo'
3
+
4
+ require 'cookler/utils'
5
+
6
# Crawls a set of target websites with Anemone, stores the extracted page
# content in MongoDB and computes word-frequency statistics per target and
# across all targets.
#
# Each target is consumed as a pair +[type, settings]+ (this is what iterating
# a Hash yields), where +settings+ is indexed as:
#   settings[0] - start URL handed to Anemone::Core.new
#   settings[1] - Regexp selecting the pages to analyse (and, optionally, the
#                 links to follow)
#   settings[2] - selector passed to +page.doc.at+ to locate the content node
#   settings[3] - when exactly +true+, only links matching settings[1] are
#                 followed
class Cookler

  # targets     - collection of targets as described on the class.
  # max_results - maximum number of analysed pages per target (15 when nil).
  #
  # Opens a Mongo::Connection with the driver's default settings and uses the
  # 'pages' and 'stats' collections of the 'cookler-db' database.
  def initialize(targets, max_results=nil)
    @targets = targets
    @max_results = max_results.nil? ? 15 : max_results
    @conn = Mongo::Connection.new
    @db = @conn['cookler-db']
    @pages = @db['pages']
    @stats = @db['stats']
    @depth_limit = 3
    @threads = []
  end

  # Convenience entry point: builds a Cookler and runs it immediately.
  # Returns whatever #run returns.
  def self.analyze(targets, max_results=nil)
    cookler = self.new(targets, max_results)
    cookler.run
  end

  # Crawls every target in its own thread, then stores one statistics document
  # per target plus a :general document that aggregates all targets.
  #
  # The 'pages' and 'stats' collections are emptied first, so each run starts
  # from a clean state. Returns the result of the final @stats.insert call.
  def run
    @pages.remove
    @stats.remove

    general_stime = Time.now.to_f

    workers = @targets.map do |target|
      Thread.new(target) do |current_target|
        current_target_stime = Time.now.to_f
        current_target_words = _get_words_on_pages(current_target)
        @stats.insert({:type => current_target[0],
                       :duration => (Time.now.to_f - current_target_stime).to_s,
                       :wstats => _word_stats(current_target_words)})
        # The thread's value is its word list. Collecting the lists through
        # Thread#value avoids the unsynchronised `total_words += ...` that
        # every thread used to perform on one shared local variable.
        current_target_words
      end
    end
    @threads.concat(workers)

    # Thread#value joins the thread (re-raising any exception it died with)
    # and returns the block's result.
    total_words = workers.flat_map { |worker| worker.value }

    @stats.insert({:type => :general,
                   :duration => (Time.now.to_f - general_stime).to_s,
                   :wstats => _word_stats(total_words)})
  end

  private

  # Builds a Hash mapping each lower-cased word to its number of occurrences,
  # ordered from most to least frequent.
  def _word_stats(words)
    counts = words.group_by { |word| word.to_s.downcase }.map { |word, instances| [word, instances.length] }
    Hash[counts.sort_by(&:last).reverse]
  end

  # Crawls one target and returns the list of words found on its matching
  # pages. Each analysed page is also stored in the 'pages' collection.
  #
  # Crawling stops once @max_results pages have been analysed or when Anemone
  # runs out of pages within @depth_limit.
  def _get_words_on_pages(target)
    settings = target[1]
    start_url = settings[0]
    url_pattern = settings[1]
    content_selector = settings[2]
    restrict_links = settings[3]

    nb_results = 0
    words = []

    anemone = Anemone::Core.new(start_url, {:depth_limit => @depth_limit})
    anemone.storage = Anemone::Storage.MongoDB

    anemone.focus_crawl do |page|
      # `>=` rather than `==` so the crawl still stops if the counter ever
      # moves past the limit.
      if nb_results >= @max_results
        anemone.kill_threads
        # Non-local return: leaves _get_words_on_pages itself. This relies on
        # Anemone invoking the block on the thread that called anemone.run.
        return words
      end
      if restrict_links == true
        page.links.delete_if { |link| (link.to_s =~ url_pattern).nil? }
      end
      page.links
    end

    anemone.on_pages_like(url_pattern) do |page|
      next if page.doc.nil?

      # Replace non-textual nodes with a space so adjacent words do not merge.
      page.doc.css('script, style, br').each { |node| node.replace ' ' }

      # A page without the content node is skipped instead of raising
      # NoMethodError and aborting the whole target.
      content = page.doc.at(content_selector)
      next if content.nil?

      title_node = page.doc.at('title')
      title = title_node ? title_node.inner_html.gsub(/[\r\t\n]/, "") : page.url.path
      data = content.inner_text

      @pages.insert({:type => target[0], :url => page.url.path, :title => title, :data => data})
      words.concat(data.removeaccents.to_s.gsub(/[0-9]+/, " ").scan(/[\w']+/))
      nb_results += 1
    end

    anemone.run
    anemone.kill_threads
    words
  end

end
82
+
@@ -0,0 +1,46 @@
1
# Reopens Anemone::Core to add #kill_threads, which stops every crawler
# thread that is still running.
module Anemone
  class Core
    # Kills each thread held in @tentacles that is still alive; threads that
    # have already finished are left untouched. Returns @tentacles.
    def kill_threads
      @tentacles.each do |tentacle|
        next unless tentacle.alive?
        tentacle.kill
      end
    end
  end
end
9
+
10
# Patch adding String#removeaccents, which replaces accented Latin characters
# with their unaccented ASCII equivalents (uses String::ACCENTS_MAPPING as the
# source map).
class String

  # Maps each ASCII replacement to the Unicode code points it stands in for.
  # The ligature entries use the real Unicode code points: Æ = U+00C6 (198),
  # æ = U+00E6 (230), Œ = U+0152 (338), œ = U+0153 (339). The earlier values
  # (306, 346, 188, 189) were octal / ISO-8859-15 spellings that, read as
  # Unicode code points, matched Ĳ, Ś, ¼ and ½ instead. 230 is listed only
  # under 'ae' so that æ expands to "ae" rather than "a".
  ACCENTS_MAPPING = {
    'E' => [200, 201, 202, 203],
    'e' => [232, 233, 234, 235],
    'A' => [192, 193, 194, 195, 196, 197],
    'a' => [224, 225, 226, 227, 228, 229],
    'C' => [199],
    'c' => [231],
    'O' => [210, 211, 212, 213, 214, 216],
    'o' => [242, 243, 244, 245, 246, 248],
    'I' => [204, 205, 206, 207],
    'i' => [236, 237, 238, 239],
    'U' => [217, 218, 219, 220],
    'u' => [249, 250, 251, 252],
    'N' => [209],
    'n' => [241],
    'Y' => [221],
    'y' => [253, 255],
    'AE' => [198],
    'ae' => [230],
    'OE' => [338],
    'oe' => [339]
  }

  # Returns a new String in which every character listed in
  # String::ACCENTS_MAPPING is replaced by its ASCII equivalent.
  # The receiver is not modified.
  def removeaccents
    str = String.new(self)
    String::ACCENTS_MAPPING.each do |letter, accents|
      # pack('U*') turns the code points into a UTF-8 string that is used as a
      # regexp character class.
      accented_chars = Regexp.new("[#{accents.pack('U*')}]")
      str.gsub!(accented_chars, letter)
    end
    str
  end
end
metadata ADDED
@@ -0,0 +1,50 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: cookler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Alexandre MOOTASSEM
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-08-19 00:00:00.000000000Z
13
+ dependencies: []
14
+ description: ! " Cookler is a Ruby library that provides spidering and analysing
15
+ features.\n It helps you to quickly write a program to retrieve content and statistics
16
+ of any website.\n The API is based on Anemone (http://anemone.rubyforge.org)
17
+ and uses MongoDB as its storage solution.\n Cookler has a multi-threaded and easy-to-use
18
+ design. The retrieved data and generated stats are\n stored in 'cookler-db'.\n"
19
+ email: alexandre.mootassem@gmail.com
20
+ executables: []
21
+ extensions: []
22
+ extra_rdoc_files: []
23
+ files:
24
+ - lib/cookler.rb
25
+ - lib/cookler/utils.rb
26
+ homepage: https://github.com/gastounage/cookler
27
+ licenses: []
28
+ post_install_message:
29
+ rdoc_options: []
30
+ require_paths:
31
+ - lib
32
+ required_ruby_version: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ required_rubygems_version: !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ requirements: []
45
+ rubyforge_project:
46
+ rubygems_version: 1.8.8
47
+ signing_key:
48
+ specification_version: 3
49
+ summary: Spidering and Analyzing Library
50
+ test_files: []