cookler 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. data/lib/cookler.rb +82 -0
  2. data/lib/cookler/utils.rb +46 -0
  3. metadata +50 -0
@@ -0,0 +1,82 @@
1
+ require 'anemone'
2
+ require 'mongo'
3
+
4
+ require 'cookler/utils'
5
+
6
# Crawls a set of target websites with Anemone, stores the text of the
# matching pages in the 'pages' collection of the 'cookler-db' MongoDB
# database and writes word-frequency statistics to its 'stats' collection.
#
# Each target is a pair [type, [start_url, url_pattern, content_selector,
# restrict_links]] (e.g. the entries of a Hash keyed by type), where, as far as
# this class uses them:
#   type             - label stored in the :type field of every document
#   start_url        - passed to Anemone::Core.new as the crawl entry point
#   url_pattern      - Regexp selecting the pages to analyse (and, optionally,
#                      the links to follow)
#   content_selector - selector handed to page.doc.at to find the text node
#   restrict_links   - when exactly true, only links matching url_pattern are
#                      followed
class Cookler

  # Number of pages collected per target when no limit is given.
  DEFAULT_MAX_RESULTS = 15

  # targets     - collection of targets (see class comment); must respond to each.
  # max_results - maximum number of analysed pages per target (default 15).
  def initialize(targets, max_results=nil)
    @targets = targets
    @max_results = (max_results.nil? ? DEFAULT_MAX_RESULTS : max_results)
    @conn = Mongo::Connection.new
    @db = @conn['cookler-db']
    @pages = @db['pages']
    @stats = @db['stats']
    @depth_limit = 3
    @threads = []
  end

  # Convenience entry point: builds a Cookler and runs it immediately.
  # Returns whatever #run returns.
  def self.analyze(targets, max_results=nil)
    cookler = self.new(targets, max_results)
    cookler.run
  end

  # Empties both collections, crawls every target in its own thread, stores
  # one stats document per target and finally one :general stats document
  # covering the words of all targets.
  # Returns the result of the final @stats.insert call.
  def run
    @pages.remove
    @stats.remove

    general_stime = Time.now.to_f
    workers = []
    @targets.each do |target|
      workers << Thread.new(target) do |current_target|
        current_target_stime = Time.now.to_f
        current_target_words = _get_words_on_pages(current_target)
        @stats.insert({
          :type => current_target[0],
          :duration => (Time.now.to_f - current_target_stime).to_s,
          :wstats => _word_stats(current_target_words)
        })
        # The thread's value is its word list, so no variable is shared
        # (and raced on) between the worker threads.
        current_target_words
      end
    end
    @threads.concat(workers)

    # Thread#value joins the thread before returning its result.
    total_words = []
    workers.each { |worker| total_words.concat(worker.value) }

    @stats.insert({
      :type => :general,
      :duration => (Time.now.to_f - general_stime).to_s,
      :wstats => _word_stats(total_words)
    })
  end

  private

  # Builds a Hash of downcased word => number of occurrences, ordered from
  # the most to the least frequent word.
  def _word_stats(words)
    counts = words.group_by { |word| word.to_s.downcase }.map { |word, instances| [word, instances.length] }
    Hash[counts.sort_by(&:last).reverse]
  end

  # Crawls a single target and returns the Array of words found on the
  # analysed pages. Every analysed page is also inserted into @pages.
  def _get_words_on_pages(target)
    type, (start_url, url_pattern, content_selector, restrict_links) = target

    nb_results = 0
    words = []

    anemone = Anemone::Core.new(start_url, {:depth_limit => @depth_limit})
    anemone.storage = Anemone::Storage.MongoDB
    anemone.focus_crawl do |page|
      if nb_results >= @max_results
        anemone.kill_threads
        # Non-local return: leaves _get_words_on_pages itself. This relies on
        # the block being invoked while anemone.run (below) is still on this
        # method's call stack -- NOTE(review): confirm against Anemone.
        return words
      end
      if restrict_links == true
        page.links.delete_if { |link| (link.to_s =~ url_pattern).nil? }
      end
      page.links
    end
    anemone.on_pages_like(url_pattern) do |page|
      doc = page.doc
      next if doc.nil?

      doc.css('script, style, br').each { |node| node.replace ' ' }

      # Skip pages without the content node instead of failing with a
      # NoMethodError on nil, which would abort the whole crawl of this target.
      content = doc.at(content_selector)
      next if content.nil?

      # Fall back to the URL path when the page has no <title>.
      title_node = doc.at('title')
      title = title_node.nil? ? page.url.path : title_node.inner_html.delete("\r\t\n")
      data = content.inner_text

      @pages.insert({:type => type, :url => page.url.path, :title => title, :data => data})
      # Digits are blanked out so that numbers never count as words.
      words.concat(data.removeaccents.to_s.gsub(/[0-9]+/, " ").scan(/[\w']+/))
      nb_results += 1
    end
    anemone.run
    anemone.kill_threads
    words
  end

end
82
+
@@ -0,0 +1,46 @@
1
# Reopens Anemone::Core to add kill_threads, a method that kills every
# thread held in @tentacles that is still running.
module Anemone
  class Core
    # Kills each thread of @tentacles that is still alive and leaves finished
    # ones untouched. Returns @tentacles (the receiver of each).
    # NOTE(review): @tentacles is presumably Anemone's list of crawler worker
    # threads -- confirm against the Anemone version in use.
    def kill_threads
      @tentacles.each do |tentacle|
        next unless tentacle.alive?
        tentacle.kill
      end
    end
  end
end
9
+
10
# Patch to remove the accents from the string. (Uses String::ACCENTS_MAPPING as the source map).
class String

  # Maps each plain replacement text to the Unicode code points it replaces.
  # The ligature entries use the real Unicode code points: the previous values
  # (306, 346, 188, 189) pointed at unrelated characters (IJ ligature,
  # S-acute, 1/4 and 1/2), and 230 (ae ligature) was also listed under 'a',
  # which shadowed the 'ae' entry.
  ACCENTS_MAPPING = {
    'E' => [200,201,202,203],
    'e' => [232,233,234,235],
    'A' => [192,193,194,195,196,197],
    'a' => [224,225,226,227,228,229],
    'C' => [199],
    'c' => [231],
    'O' => [210,211,212,213,214,216],
    'o' => [242,243,244,245,246,248],
    'I' => [204,205,206,207],
    'i' => [236,237,238,239],
    'U' => [217,218,219,220],
    'u' => [249,250,251,252],
    'N' => [209],
    'n' => [241],
    'Y' => [221],
    'y' => [253,255],
    'AE' => [198],
    'ae' => [230],
    'OE' => [338],
    'oe' => [339]
  }

  # Returns a new String in which every character listed in
  # String::ACCENTS_MAPPING is replaced by its plain counterpart.
  # The receiver itself is left unchanged.
  def removeaccents
    str = String.new(self)
    String::ACCENTS_MAPPING.each do |letter, accents|
      accented = accents.map { |code_point| [code_point].pack('U') }
      # Regexp.union escapes every entry, so a code point that is special
      # inside a character class could never corrupt the pattern.
      str.gsub!(Regexp.union(accented), letter)
    end

    str
  end
end
metadata ADDED
@@ -0,0 +1,50 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: cookler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Alexandre MOOTASSEM
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-08-19 00:00:00.000000000Z
13
+ dependencies: []
14
+ description: ! " Cookler is a Ruby library that provides spidering and analysing
15
+ features.\n It helps you to quickly write a program to retrieve content and statistics
16
+ of any website.\n The API is based on Anemone (http://anemone.rubyforge.org)
17
+ and uses MongoDB as its storage solution.\n Cookler has a multi-threaded and easy-to-use
18
+ design. The retrieved data and generated stats are\n stored in 'cookler-db'.\n"
19
+ email: alexandre.mootassem@gmail.com
20
+ executables: []
21
+ extensions: []
22
+ extra_rdoc_files: []
23
+ files:
24
+ - lib/cookler.rb
25
+ - lib/cookler/utils.rb
26
+ homepage: https://github.com/gastounage/cookler
27
+ licenses: []
28
+ post_install_message:
29
+ rdoc_options: []
30
+ require_paths:
31
+ - lib
32
+ required_ruby_version: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ required_rubygems_version: !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ requirements: []
45
+ rubyforge_project:
46
+ rubygems_version: 1.8.8
47
+ signing_key:
48
+ specification_version: 3
49
+ summary: Spidering and Analyzing Library
50
+ test_files: []