socialcrawler 0.0.0 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/.coveralls.yml ADDED
@@ -0,0 +1 @@
1
+ service_name: travis-ci
data/.travis.yml ADDED
@@ -0,0 +1,10 @@
1
+ language: ruby
2
+
3
+ rvm:
4
+ - "1.9.3"
5
+ - jruby-19mode
6
+ - rbx
7
+
8
+ addons:
9
+ climate_control:
10
+ repo_token: c5d1031f512ff1ec104782e87ba1514dff71d8a564b928076e54bb67226552df
data/Gemfile CHANGED
@@ -3,4 +3,7 @@ source 'https://rubygems.org'
3
3
  # Specify your gem's dependencies in SocialCrawler.gemspec
4
4
  gemspec
5
5
 
6
- gem 'nokogiri', '~> 1.6.0'
6
+ gem 'nokogiri', '1.6.4.1'
7
+ # test coverage
8
+ gem 'coveralls', require: false, group: :test
9
+ gem "codeclimate-test-reporter", group: :test, require: nil
data/README.md CHANGED
@@ -1,6 +1,10 @@
1
+ [![Coverage Status](https://img.shields.io/coveralls/iceraj/socialcrawler.svg)](https://coveralls.io/r/iceraj/socialcrawler)
2
+ [![Build Status](https://travis-ci.org/iceraj/socialcrawler.svg?branch=feature%2FIntial_Development)](https://travis-ci.org/iceraj/socialcrawler)
3
+ [![Code Climate](https://codeclimate.com/github/iceraj/socialcrawler/badges/gpa.svg)](https://codeclimate.com/github/iceraj/socialcrawler)
4
+
1
5
  # Socialcrawler
2
6
 
3
- TODO: Write a gem description
7
+ SocialCrawler looks for social media links for different sites
4
8
 
5
9
  ## Installation
6
10
 
@@ -24,7 +28,7 @@ TODO: Write usage instructions here
24
28
 
25
29
  ## Contributing
26
30
 
27
- 1. Fork it ( https://github.com/[my-github-username]/socialcrawler/fork )
31
+ 1. Fork it ( https://github.com/iceraj/socialcrawler/fork )
28
32
  2. Create your feature branch (`git checkout -b my-new-feature`)
29
33
  3. Commit your changes (`git commit -am 'Add some feature'`)
30
34
  4. Push to the branch (`git push origin my-new-feature`)
data/Rakefile CHANGED
@@ -1,2 +1,9 @@
1
1
  require "bundler/gem_tasks"
2
+ require 'rake/testtask'
2
3
 
4
+ Rake::TestTask.new do |t|
5
+ t.libs << 'test'
6
+ end
7
+
8
+ desc "Run tests"
9
+ task :default => :test
data/lib/socialcrawler.rb CHANGED
@@ -22,139 +22,155 @@ require 'logger'
22
22
 
23
23
  module SocialCrawler
24
24
 
25
+ class SocialCrawler
25
26
 
26
- def self._put( hash, symbol , value , log=nil)
27
- log = Logger.new(STDOUT) if log.nil?
28
- if not hash.has_key?( symbol)
29
- hash[symbol] = value
30
- else
31
- hash[symbol] = "#{hash[symbol]} #{value}"
32
- log.info( "Multiple values for #{symbol} value #{hash[symbol]}")
27
+ def initialize
28
+ @map = {
29
+ twitter: 'twitter.com/',
30
+ facebook: 'facebook.com/',
31
+ google_plus: 'plus.google.com/'
32
+ }
33
33
  end
34
- end
35
34
 
36
- def self.crawl_url(url,log=nil)
37
- log = Logger.new(STDOUT) if log.nil?
38
- log.info( "Crawling #{url}")
39
- result = Hash.new('NOT FOUND')
40
- begin
41
- page = Nokogiri::HTML(open(url))
42
- title = page.css('title')
43
- if not title.nil?
44
- result[:title] = title.text.strip
35
+ def _put(hash, symbol, value, log=nil)
36
+ log = Logger.new(STDOUT) if log.nil?
37
+ if not hash.has_key?(symbol)
38
+ hash[symbol] = value
39
+ else
40
+ hash[symbol] = "#{hash[symbol]} #{value}"
41
+ log.info("Multiple values for #{symbol} value #{hash[symbol]}")
45
42
  end
43
+ end
44
+
45
+ def page_to_result(page, result, log)
46
46
  links = page.css('a[href]')
47
47
  links.each do |link|
48
48
  link_url = link['href']
49
-
50
- if not link_url.index('twitter.com/').nil?
51
- log.info( "twitter #{link_url} for #{url}")
52
- _put(result,:twitter,link_url,log)
49
+ @map.each do |k, prefix|
50
+ if not link_url.index(prefix).nil?
51
+ _put(result, k, link_url, log)
52
+ end
53
53
  end
54
- if not link_url.index('facebook.com/').nil?
55
- log.info( "facebook #{link_url} for #{url}")
56
- _put(result,:facebook,link_url,log)
54
+ end
55
+ end
56
+
57
+ def crawl_url(url, log=nil)
58
+ log = Logger.new(STDOUT) if log.nil?
59
+ log.info("Crawling #{url}")
60
+ result = Hash.new('NOT FOUND')
61
+ begin
62
+ page = Nokogiri::HTML(open(url))
63
+ title = page.css('title')
64
+ if not title.nil?
65
+ result[:title] = title.text.strip
57
66
  end
58
- if not link_url.index('plus.google.com/').nil?
59
- log.info( "google_plus #{link_url} for #{url}")
60
- _put(result,:google_plus,link_url,log)
67
+ page_to_result(page, result, log)
68
+ result[:url] = url
69
+ result[:success] = true
70
+ result[:message] = ''
71
+ rescue Exception => e
72
+ result[:url] = url
73
+ result[:success] = false
74
+ result[:message] = "#{e}"
75
+ end
76
+ return result
77
+ end
78
+
79
+ def load_status_cache(status_filename, log=nil)
80
+ status = Hash.new
81
+ if not status_filename.nil? and File.exists?(status_filename)
82
+ log.info("Loading previous status from #{status_filename}")
83
+ CSV.foreach(status_filename) do |row|
84
+ if row.count < 3
85
+ next
86
+ end
87
+ url = row[0]
88
+ result = row[1]
89
+ message = row[2]
90
+ status[url] = {
91
+ :url => url,
92
+ :result => result,
93
+ :message => message
94
+ }
61
95
  end
96
+ log.info("Loading previous status from #{status_filename} finished, #{status.keys.length} loaded.")
62
97
  end
63
- result[:url] = url
64
- result[:success] = true
65
- result[:message] = ''
66
- rescue Exception => e
67
- result[:url] = url
68
- result[:success] = false
69
- result[:message] = "#{e}"
98
+ return status
70
99
  end
71
- return result
72
- end
73
100
 
74
- def self.crawl( domain_list_filename, output_list_filename, status_filename=nil , log=nil)
75
- log = Logger.new(STDOUT) if log.nil?
76
- log.info( "Crawler started")
77
- status = Hash.new
78
- if not status_filename.nil? and File.exists?(status_filename)
79
- log.info( "Loading previous status from #{status_filename}")
80
- CSV.foreach( status_filename ) do |row|
81
- begin
101
+ def load_output_cache(output_list_filename, log=nil)
102
+ data = Hash.new()
103
+ log.info("Loading previous status from #{output_list_filename}")
104
+ if not File.exist?(output_list_filename)
105
+ return data
106
+ end
107
+ CSV.foreach(output_list_filename) do |row|
108
+ log.info("Loading #{row} #{row.count}")
109
+ if row.count < 5
110
+ next
111
+ end
82
112
  url = row[0]
83
- result = row[1]
84
- message = row[2]
85
- status[url] = {
113
+ title= row[1]
114
+ twitter = row[2]
115
+ facebook = row[3]
116
+ google_plus = row[4]
117
+ data[url] = {
86
118
  :url => url,
87
- :result => result,
88
- :message => message
119
+ :title => title,
120
+ :twitter => twitter,
121
+ :facebook => facebook,
122
+ :google_plus => google_plus
89
123
  }
90
- rescue Exception => e
91
- log.info("Exception reading file #{e}")
92
- end
124
+ log.info("Loading previous status from #{output_list_filename} finished, #{data.keys.length} loaded.")
93
125
  end
94
- log.info( "Loading previous status from #{status_filename} finished, #{status.keys.length} loaded.")
126
+ return data
95
127
  end
96
128
 
97
- data = Hash.new()
98
- if File.exist?(output_list_filename)
99
- log.info( "Loading previous status from #{output_list_filename}")
100
- CSV.open( output_list_filename ) do |row|
101
- if row.count >= 5
102
- url = row[0]
103
- title= row[1]
104
- twitter = row[2]
105
- facebook = row[3]
106
- google_plus = row[4]
107
- data[url] = {
108
- :url => url,
109
- :title => title,
110
- :twitter => twitter,
111
- :facebook => facebook,
112
- :google_plus => google_plus
113
- }
114
- end
129
+ def crawl(domain_list_filename, output_list_filename, status_filename=nil, log=nil)
130
+ log = Logger.new(STDOUT) if log.nil?
131
+ log.info("Crawler started")
132
+
133
+ status = load_status_cache(status_filename, log)
134
+
135
+ data = load_output_cache(output_list_filename, log)
136
+
137
+ CSV.open(output_list_filename, "wb") do |output|
138
+ data.each do |k, v|
139
+ output << [k, v[:title], v[:twitter], v[:facebook], v[:google_plus]]
140
+ end
141
+ CSV.open(status_filename, "wb") do |status_line|
142
+ status.each do |k, v|
143
+ status_line << [k, v[:success], v[:message]]
144
+ end
145
+ crawl_loop(data, domain_list_filename, log, output, status, status_line)
146
+ end
115
147
  end
116
- log.info( "Loading previous status from #{output_list_filename} finished, #{data.keys.length} loaded.")
117
148
  end
118
149
 
119
- CSV.foreach( domain_list_filename ) do |row|
150
+ def crawl_loop(data, domain_list_filename, log, output, status, status_line)
151
+ CSV.foreach(domain_list_filename) do |row|
120
152
  url = row[0]
121
153
  if status.has_key?(url)
122
- # already visited, skip
123
- else
124
- result = crawl_url(url,log)
125
- if result[:success] == true
126
- CSV.open( output_list_filename, "wb") do |output|
127
- data.each do |k,v|
128
- log.info(k)
129
- log.info(v)
130
- output << [k,v[:title],v[:twitter],v[:facebook],v[:google_plus]]
131
- end
132
- output << [url, result[:title], result[:twitter], result[:facebook], result[:google_plus]]
133
- data[url] = result
134
- end
135
- status[url] = {
136
- :url => url,
137
- :result => 'success',
138
- :message => ''
139
- }
140
- CSV.open( status_filename, "wb" ) do |status_line|
141
- status_line << [url,'success','']
142
- end
143
- else
144
- status[url] = {
145
- :url => url,
146
- :result => result[:success],
147
- :message => result[:message]
148
- }
149
- CSV.open( status_filename, "wb" ) do |status_line|
150
- status_line << [url,result[:success],result[:message]]
151
- end
154
+ next
155
+ end
156
+ result = crawl_url(url, log)
157
+ if result[:success] == true
158
+ data[url] = result
159
+ output << [url, result[:title], result[:twitter], result[:facebook], result[:google_plus]]
152
160
  end
161
+ status[url] = {
162
+ :url => url,
163
+ :result => result[:success],
164
+ :message => result[:message]
165
+ }
166
+ status_line << [url, result[:success], result[:message]]
153
167
  end
154
168
  end
155
169
  end
156
170
  end
157
171
 
158
172
  if __FILE__ == $0
159
- SocialCrawler.crawl(ARGV[0],ARGV[1],ARGV[2])
173
+ #:nocov:
174
+ SocialCrawler::SocialCrawler.new.crawl(ARGV[0], ARGV[1], ARGV[2])
175
+ #:nocov:
160
176
  end
@@ -1,3 +1,3 @@
1
1
  module SocialCrawler
2
- VERSION = "0.0.0"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -0,0 +1,25 @@
1
+ require 'test_helper'
2
+ require "test/unit"
3
+ require 'semantic'
4
+ require 'socialcrawler'
5
+
6
+ class CrawlewrTest < Test::Unit::TestCase
7
+
8
+ def test_version
9
+ s = SocialCrawler::VERSION
10
+ v = Semantic::Version.new(s)
11
+ assert_equal v.to_s, s
12
+ end
13
+
14
+ def test_1
15
+ File.delete('/tmp/test_out.txt') if File.exists?('/tmp/test_out.txt')
16
+ File.delete('/tmp/test_status.txt') if File.exists?('/tmp/test_status.txt')
17
+
18
+ sc = SocialCrawler::SocialCrawler.new
19
+ sc.crawl('test/test_url.txt', '/tmp/test_out.txt', '/tmp/test_status.txt')
20
+
21
+ sc = SocialCrawler::SocialCrawler.new
22
+ sc.crawl('test/test_url.txt', '/tmp/test_out.txt', '/tmp/test_status.txt')
23
+ end
24
+
25
+ end
data/test/test_helper.rb CHANGED
@@ -1,2 +1,12 @@
1
+ require "codeclimate-test-reporter"
1
2
  require 'simplecov'
2
- SimpleCov.start
3
+ require 'coveralls'
4
+ SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
5
+ SimpleCov::Formatter::HTMLFormatter,
6
+ Coveralls::SimpleCov::Formatter
7
+ ]
8
+ CodeClimate::TestReporter.start
9
+ SimpleCov.start
10
+ puts "Simple Coverage Started"
11
+
12
+
data/test/test_url.txt ADDED
@@ -0,0 +1,4 @@
1
+ https://twitter.com/bugaco
2
+ https://plus.google.com/101033631762132540828/posts
3
+ https://www.facebook.com/
4
+ https://localhost/
metadata CHANGED
@@ -1,83 +1,94 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: socialcrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.0.2
5
+ prerelease:
5
6
  platform: ruby
6
7
  authors:
7
8
  - Ivica Ceraj
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
- date: 2015-01-12 00:00:00.000000000 Z
12
+ date: 2015-01-16 00:00:00.000000000 Z
12
13
  dependencies:
13
14
  - !ruby/object:Gem::Dependency
14
15
  name: bundler
15
16
  requirement: !ruby/object:Gem::Requirement
17
+ none: false
16
18
  requirements:
17
- - - "~>"
19
+ - - ~>
18
20
  - !ruby/object:Gem::Version
19
21
  version: '1.7'
20
22
  type: :development
21
23
  prerelease: false
22
24
  version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
23
26
  requirements:
24
- - - "~>"
27
+ - - ~>
25
28
  - !ruby/object:Gem::Version
26
29
  version: '1.7'
27
30
  - !ruby/object:Gem::Dependency
28
31
  name: rake
29
32
  requirement: !ruby/object:Gem::Requirement
33
+ none: false
30
34
  requirements:
31
- - - "~>"
35
+ - - ~>
32
36
  - !ruby/object:Gem::Version
33
37
  version: '10.0'
34
38
  type: :development
35
39
  prerelease: false
36
40
  version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
37
42
  requirements:
38
- - - "~>"
43
+ - - ~>
39
44
  - !ruby/object:Gem::Version
40
45
  version: '10.0'
41
46
  - !ruby/object:Gem::Dependency
42
47
  name: semantic
43
48
  requirement: !ruby/object:Gem::Requirement
49
+ none: false
44
50
  requirements:
45
- - - "~>"
51
+ - - ~>
46
52
  - !ruby/object:Gem::Version
47
53
  version: '1.0'
48
54
  type: :development
49
55
  prerelease: false
50
56
  version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
51
58
  requirements:
52
- - - "~>"
59
+ - - ~>
53
60
  - !ruby/object:Gem::Version
54
61
  version: '1.0'
55
62
  - !ruby/object:Gem::Dependency
56
63
  name: simplecov
57
64
  requirement: !ruby/object:Gem::Requirement
65
+ none: false
58
66
  requirements:
59
- - - "~>"
67
+ - - ~>
60
68
  - !ruby/object:Gem::Version
61
69
  version: '0.9'
62
70
  type: :development
63
71
  prerelease: false
64
72
  version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
65
74
  requirements:
66
- - - "~>"
75
+ - - ~>
67
76
  - !ruby/object:Gem::Version
68
77
  version: '0.9'
69
78
  - !ruby/object:Gem::Dependency
70
79
  name: simplecov-html
71
80
  requirement: !ruby/object:Gem::Requirement
81
+ none: false
72
82
  requirements:
73
- - - "~>"
83
+ - - ~>
74
84
  - !ruby/object:Gem::Version
75
85
  version: '0.8'
76
86
  type: :development
77
87
  prerelease: false
78
88
  version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
79
90
  requirements:
80
- - - "~>"
91
+ - - ~>
81
92
  - !ruby/object:Gem::Version
82
93
  version: '0.8'
83
94
  description: It read file containing list of urls and produces output file with domain,
@@ -88,7 +99,9 @@ executables: []
88
99
  extensions: []
89
100
  extra_rdoc_files: []
90
101
  files:
91
- - ".gitignore"
102
+ - .coveralls.yml
103
+ - .gitignore
104
+ - .travis.yml
92
105
  - Gemfile
93
106
  - LICENSE.txt
94
107
  - README.md
@@ -96,32 +109,35 @@ files:
96
109
  - lib/socialcrawler.rb
97
110
  - lib/socialcrawler/version.rb
98
111
  - socialcrawler.gemspec
112
+ - test/test_crawler.rb
99
113
  - test/test_helper.rb
100
- - test/unit/semver_test.rb
114
+ - test/test_url.txt
101
115
  homepage: http://github.com/iceraj/socialcrawler
102
116
  licenses:
103
117
  - LGPL 2.1
104
- metadata: {}
105
118
  post_install_message:
106
119
  rdoc_options: []
107
120
  require_paths:
108
121
  - lib
109
122
  required_ruby_version: !ruby/object:Gem::Requirement
123
+ none: false
110
124
  requirements:
111
- - - ">="
125
+ - - ! '>='
112
126
  - !ruby/object:Gem::Version
113
127
  version: '0'
114
128
  required_rubygems_version: !ruby/object:Gem::Requirement
129
+ none: false
115
130
  requirements:
116
- - - ">="
131
+ - - ! '>='
117
132
  - !ruby/object:Gem::Version
118
133
  version: '0'
119
134
  requirements: []
120
135
  rubyforge_project:
121
- rubygems_version: 2.4.5
136
+ rubygems_version: 1.8.24
122
137
  signing_key:
123
- specification_version: 4
138
+ specification_version: 3
124
139
  summary: SocialCrawler looks for social media links for different sites
125
140
  test_files:
141
+ - test/test_crawler.rb
126
142
  - test/test_helper.rb
127
- - test/unit/semver_test.rb
143
+ - test/test_url.txt
checksums.yaml DELETED
@@ -1,7 +0,0 @@
1
- ---
2
- SHA1:
3
- metadata.gz: 1bfd2a9261b07fb456185b808c9f532128f0fcdc
4
- data.tar.gz: 42f27c1214c3b8e850a6b9106bc0002a0f76d862
5
- SHA512:
6
- metadata.gz: 21878df5ce6b2c75e3af24e4f2b950076098af255c88e900d639e3852eedb21587d485c83271df3b08bd2d6a2f363d2faf8e9052b3d497431f46b7f195433de3
7
- data.tar.gz: e59148b275a58ec7ad5b692c25f030e94ce2ed585b5b69010d39df19fb17b112a62c65204a59a8e2fafa23485881d67d9cc7027a82dbb00192664ed0056170fd
@@ -1,14 +0,0 @@
1
- require "test/unit"
2
- require 'semantic'
3
- require 'socialcrawler'
4
- require 'test_helper'
5
-
6
- class VersioningTest < Test::Unit::TestCase
7
-
8
- def test_version
9
- s = SocialCrawler::VERSION
10
- v = Semantic::Version.new(s)
11
- assert_equal v.to_s, s
12
- end
13
-
14
- end