socialcrawler 0.0.0 → 0.0.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
data/.coveralls.yml ADDED
@@ -0,0 +1 @@
1
+ service_name: travis-ci
data/.travis.yml ADDED
@@ -0,0 +1,10 @@
1
+ language: ruby
2
+
3
+ rvm:
4
+ - "1.9.3"
5
+ - jruby-19mode
6
+ - rbx
7
+
8
+ addons:
9
+ climate_control:
10
+ repo_token: c5d1031f512ff1ec104782e87ba1514dff71d8a564b928076e54bb67226552df
data/Gemfile CHANGED
@@ -3,4 +3,7 @@ source 'https://rubygems.org'
3
3
  # Specify your gem's dependencies in SocialCrawler.gemspec
4
4
  gemspec
5
5
 
6
- gem 'nokogiri', '~> 1.6.0'
6
+ gem 'nokogiri', '1.6.4.1'
7
+ # test coverage
8
+ gem 'coveralls', require: false, group: :test
9
+ gem "codeclimate-test-reporter", group: :test, require: nil
data/README.md CHANGED
@@ -1,6 +1,10 @@
1
+ [![Coverage Status](https://img.shields.io/coveralls/iceraj/socialcrawler.svg)](https://coveralls.io/r/iceraj/socialcrawler)
2
+ [![Build Status](https://travis-ci.org/iceraj/socialcrawler.svg?branch=feature%2FIntial_Development)](https://travis-ci.org/iceraj/socialcrawler)
3
+ [![Code Climate](https://codeclimate.com/github/iceraj/socialcrawler/badges/gpa.svg)](https://codeclimate.com/github/iceraj/socialcrawler)
4
+
1
5
  # Socialcrawler
2
6
 
3
- TODO: Write a gem description
7
+ SocialCrawler looks for social media links for different sites
4
8
 
5
9
  ## Installation
6
10
 
@@ -24,7 +28,7 @@ TODO: Write usage instructions here
24
28
 
25
29
  ## Contributing
26
30
 
27
- 1. Fork it ( https://github.com/[my-github-username]/socialcrawler/fork )
31
+ 1. Fork it ( https://github.com/iceraj/socialcrawler/fork )
28
32
  2. Create your feature branch (`git checkout -b my-new-feature`)
29
33
  3. Commit your changes (`git commit -am 'Add some feature'`)
30
34
  4. Push to the branch (`git push origin my-new-feature`)
data/Rakefile CHANGED
@@ -1,2 +1,9 @@
1
1
  require "bundler/gem_tasks"
2
+ require 'rake/testtask'
2
3
 
4
+ Rake::TestTask.new do |t|
5
+ t.libs << 'test'
6
+ end
7
+
8
+ desc "Run tests"
9
+ task :default => :test
data/lib/socialcrawler.rb CHANGED
@@ -22,139 +22,155 @@ require 'logger'
22
22
 
23
23
  module SocialCrawler
24
24
 
25
+ class SocialCrawler
25
26
 
26
- def self._put( hash, symbol , value , log=nil)
27
- log = Logger.new(STDOUT) if log.nil?
28
- if not hash.has_key?( symbol)
29
- hash[symbol] = value
30
- else
31
- hash[symbol] = "#{hash[symbol]} #{value}"
32
- log.info( "Multiple values for #{symbol} value #{hash[symbol]}")
27
+ def initialize
28
+ @map = {
29
+ twitter: 'twitter.com/',
30
+ facebook: 'facebook.com/',
31
+ google_plus: 'plus.google.com/'
32
+ }
33
33
  end
34
- end
35
34
 
36
- def self.crawl_url(url,log=nil)
37
- log = Logger.new(STDOUT) if log.nil?
38
- log.info( "Crawling #{url}")
39
- result = Hash.new('NOT FOUND')
40
- begin
41
- page = Nokogiri::HTML(open(url))
42
- title = page.css('title')
43
- if not title.nil?
44
- result[:title] = title.text.strip
35
+ def _put(hash, symbol, value, log=nil)
36
+ log = Logger.new(STDOUT) if log.nil?
37
+ if not hash.has_key?(symbol)
38
+ hash[symbol] = value
39
+ else
40
+ hash[symbol] = "#{hash[symbol]} #{value}"
41
+ log.info("Multiple values for #{symbol} value #{hash[symbol]}")
45
42
  end
43
+ end
44
+
45
+ def page_to_result(page, result, log)
46
46
  links = page.css('a[href]')
47
47
  links.each do |link|
48
48
  link_url = link['href']
49
-
50
- if not link_url.index('twitter.com/').nil?
51
- log.info( "twitter #{link_url} for #{url}")
52
- _put(result,:twitter,link_url,log)
49
+ @map.each do |k, prefix|
50
+ if not link_url.index(prefix).nil?
51
+ _put(result, k, link_url, log)
52
+ end
53
53
  end
54
- if not link_url.index('facebook.com/').nil?
55
- log.info( "facebook #{link_url} for #{url}")
56
- _put(result,:facebook,link_url,log)
54
+ end
55
+ end
56
+
57
+ def crawl_url(url, log=nil)
58
+ log = Logger.new(STDOUT) if log.nil?
59
+ log.info("Crawling #{url}")
60
+ result = Hash.new('NOT FOUND')
61
+ begin
62
+ page = Nokogiri::HTML(open(url))
63
+ title = page.css('title')
64
+ if not title.nil?
65
+ result[:title] = title.text.strip
57
66
  end
58
- if not link_url.index('plus.google.com/').nil?
59
- log.info( "google_plus #{link_url} for #{url}")
60
- _put(result,:google_plus,link_url,log)
67
+ page_to_result(page, result, log)
68
+ result[:url] = url
69
+ result[:success] = true
70
+ result[:message] = ''
71
+ rescue Exception => e
72
+ result[:url] = url
73
+ result[:success] = false
74
+ result[:message] = "#{e}"
75
+ end
76
+ return result
77
+ end
78
+
79
+ def load_status_cache(status_filename, log=nil)
80
+ status = Hash.new
81
+ if not status_filename.nil? and File.exists?(status_filename)
82
+ log.info("Loading previous status from #{status_filename}")
83
+ CSV.foreach(status_filename) do |row|
84
+ if row.count < 3
85
+ next
86
+ end
87
+ url = row[0]
88
+ result = row[1]
89
+ message = row[2]
90
+ status[url] = {
91
+ :url => url,
92
+ :result => result,
93
+ :message => message
94
+ }
61
95
  end
96
+ log.info("Loading previous status from #{status_filename} finished, #{status.keys.length} loaded.")
62
97
  end
63
- result[:url] = url
64
- result[:success] = true
65
- result[:message] = ''
66
- rescue Exception => e
67
- result[:url] = url
68
- result[:success] = false
69
- result[:message] = "#{e}"
98
+ return status
70
99
  end
71
- return result
72
- end
73
100
 
74
- def self.crawl( domain_list_filename, output_list_filename, status_filename=nil , log=nil)
75
- log = Logger.new(STDOUT) if log.nil?
76
- log.info( "Crawler started")
77
- status = Hash.new
78
- if not status_filename.nil? and File.exists?(status_filename)
79
- log.info( "Loading previous status from #{status_filename}")
80
- CSV.foreach( status_filename ) do |row|
81
- begin
101
+ def load_output_cache(output_list_filename, log=nil)
102
+ data = Hash.new()
103
+ log.info("Loading previous status from #{output_list_filename}")
104
+ if not File.exist?(output_list_filename)
105
+ return data
106
+ end
107
+ CSV.foreach(output_list_filename) do |row|
108
+ log.info("Loading #{row} #{row.count}")
109
+ if row.count < 5
110
+ next
111
+ end
82
112
  url = row[0]
83
- result = row[1]
84
- message = row[2]
85
- status[url] = {
113
+ title= row[1]
114
+ twitter = row[2]
115
+ facebook = row[3]
116
+ google_plus = row[4]
117
+ data[url] = {
86
118
  :url => url,
87
- :result => result,
88
- :message => message
119
+ :title => title,
120
+ :twitter => twitter,
121
+ :facebook => facebook,
122
+ :google_plus => google_plus
89
123
  }
90
- rescue Exception => e
91
- log.info("Exception reading file #{e}")
92
- end
124
+ log.info("Loading previous status from #{output_list_filename} finished, #{data.keys.length} loaded.")
93
125
  end
94
- log.info( "Loading previous status from #{status_filename} finished, #{status.keys.length} loaded.")
126
+ return data
95
127
  end
96
128
 
97
- data = Hash.new()
98
- if File.exist?(output_list_filename)
99
- log.info( "Loading previous status from #{output_list_filename}")
100
- CSV.open( output_list_filename ) do |row|
101
- if row.count >= 5
102
- url = row[0]
103
- title= row[1]
104
- twitter = row[2]
105
- facebook = row[3]
106
- google_plus = row[4]
107
- data[url] = {
108
- :url => url,
109
- :title => title,
110
- :twitter => twitter,
111
- :facebook => facebook,
112
- :google_plus => google_plus
113
- }
114
- end
129
+ def crawl(domain_list_filename, output_list_filename, status_filename=nil, log=nil)
130
+ log = Logger.new(STDOUT) if log.nil?
131
+ log.info("Crawler started")
132
+
133
+ status = load_status_cache(status_filename, log)
134
+
135
+ data = load_output_cache(output_list_filename, log)
136
+
137
+ CSV.open(output_list_filename, "wb") do |output|
138
+ data.each do |k, v|
139
+ output << [k, v[:title], v[:twitter], v[:facebook], v[:google_plus]]
140
+ end
141
+ CSV.open(status_filename, "wb") do |status_line|
142
+ status.each do |k, v|
143
+ status_line << [k, v[:success], v[:message]]
144
+ end
145
+ crawl_loop(data, domain_list_filename, log, output, status, status_line)
146
+ end
115
147
  end
116
- log.info( "Loading previous status from #{output_list_filename} finished, #{data.keys.length} loaded.")
117
148
  end
118
149
 
119
- CSV.foreach( domain_list_filename ) do |row|
150
+ def crawl_loop(data, domain_list_filename, log, output, status, status_line)
151
+ CSV.foreach(domain_list_filename) do |row|
120
152
  url = row[0]
121
153
  if status.has_key?(url)
122
- # already visited, skip
123
- else
124
- result = crawl_url(url,log)
125
- if result[:success] == true
126
- CSV.open( output_list_filename, "wb") do |output|
127
- data.each do |k,v|
128
- log.info(k)
129
- log.info(v)
130
- output << [k,v[:title],v[:twitter],v[:facebook],v[:google_plus]]
131
- end
132
- output << [url, result[:title], result[:twitter], result[:facebook], result[:google_plus]]
133
- data[url] = result
134
- end
135
- status[url] = {
136
- :url => url,
137
- :result => 'success',
138
- :message => ''
139
- }
140
- CSV.open( status_filename, "wb" ) do |status_line|
141
- status_line << [url,'success','']
142
- end
143
- else
144
- status[url] = {
145
- :url => url,
146
- :result => result[:success],
147
- :message => result[:message]
148
- }
149
- CSV.open( status_filename, "wb" ) do |status_line|
150
- status_line << [url,result[:success],result[:message]]
151
- end
154
+ next
155
+ end
156
+ result = crawl_url(url, log)
157
+ if result[:success] == true
158
+ data[url] = result
159
+ output << [url, result[:title], result[:twitter], result[:facebook], result[:google_plus]]
152
160
  end
161
+ status[url] = {
162
+ :url => url,
163
+ :result => result[:success],
164
+ :message => result[:message]
165
+ }
166
+ status_line << [url, result[:success], result[:message]]
153
167
  end
154
168
  end
155
169
  end
156
170
  end
157
171
 
158
172
  if __FILE__ == $0
159
- SocialCrawler.crawl(ARGV[0],ARGV[1],ARGV[2])
173
+ #:nocov:
174
+ SocialCrawler::SocialCrawler.new.crawl(ARGV[0], ARGV[1], ARGV[2])
175
+ #:nocov:
160
176
  end
@@ -1,3 +1,3 @@
1
1
  module SocialCrawler
2
- VERSION = "0.0.0"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -0,0 +1,25 @@
1
+ require 'test_helper'
2
+ require "test/unit"
3
+ require 'semantic'
4
+ require 'socialcrawler'
5
+
6
+ class CrawlewrTest < Test::Unit::TestCase
7
+
8
+ def test_version
9
+ s = SocialCrawler::VERSION
10
+ v = Semantic::Version.new(s)
11
+ assert_equal v.to_s, s
12
+ end
13
+
14
+ def test_1
15
+ File.delete('/tmp/test_out.txt') if File.exists?('/tmp/test_out.txt')
16
+ File.delete('/tmp/test_status.txt') if File.exists?('/tmp/test_status.txt')
17
+
18
+ sc = SocialCrawler::SocialCrawler.new
19
+ sc.crawl('test/test_url.txt', '/tmp/test_out.txt', '/tmp/test_status.txt')
20
+
21
+ sc = SocialCrawler::SocialCrawler.new
22
+ sc.crawl('test/test_url.txt', '/tmp/test_out.txt', '/tmp/test_status.txt')
23
+ end
24
+
25
+ end
data/test/test_helper.rb CHANGED
@@ -1,2 +1,12 @@
1
+ require "codeclimate-test-reporter"
1
2
  require 'simplecov'
2
- SimpleCov.start
3
+ require 'coveralls'
4
+ SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
5
+ SimpleCov::Formatter::HTMLFormatter,
6
+ Coveralls::SimpleCov::Formatter
7
+ ]
8
+ CodeClimate::TestReporter.start
9
+ SimpleCov.start
10
+ puts "Simple Coverage Started"
11
+
12
+
data/test/test_url.txt ADDED
@@ -0,0 +1,4 @@
1
+ https://twitter.com/bugaco
2
+ https://plus.google.com/101033631762132540828/posts
3
+ https://www.facebook.com/
4
+ https://localhost/
metadata CHANGED
@@ -1,83 +1,94 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: socialcrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.0.2
5
+ prerelease:
5
6
  platform: ruby
6
7
  authors:
7
8
  - Ivica Ceraj
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
- date: 2015-01-12 00:00:00.000000000 Z
12
+ date: 2015-01-16 00:00:00.000000000 Z
12
13
  dependencies:
13
14
  - !ruby/object:Gem::Dependency
14
15
  name: bundler
15
16
  requirement: !ruby/object:Gem::Requirement
17
+ none: false
16
18
  requirements:
17
- - - "~>"
19
+ - - ~>
18
20
  - !ruby/object:Gem::Version
19
21
  version: '1.7'
20
22
  type: :development
21
23
  prerelease: false
22
24
  version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
23
26
  requirements:
24
- - - "~>"
27
+ - - ~>
25
28
  - !ruby/object:Gem::Version
26
29
  version: '1.7'
27
30
  - !ruby/object:Gem::Dependency
28
31
  name: rake
29
32
  requirement: !ruby/object:Gem::Requirement
33
+ none: false
30
34
  requirements:
31
- - - "~>"
35
+ - - ~>
32
36
  - !ruby/object:Gem::Version
33
37
  version: '10.0'
34
38
  type: :development
35
39
  prerelease: false
36
40
  version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
37
42
  requirements:
38
- - - "~>"
43
+ - - ~>
39
44
  - !ruby/object:Gem::Version
40
45
  version: '10.0'
41
46
  - !ruby/object:Gem::Dependency
42
47
  name: semantic
43
48
  requirement: !ruby/object:Gem::Requirement
49
+ none: false
44
50
  requirements:
45
- - - "~>"
51
+ - - ~>
46
52
  - !ruby/object:Gem::Version
47
53
  version: '1.0'
48
54
  type: :development
49
55
  prerelease: false
50
56
  version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
51
58
  requirements:
52
- - - "~>"
59
+ - - ~>
53
60
  - !ruby/object:Gem::Version
54
61
  version: '1.0'
55
62
  - !ruby/object:Gem::Dependency
56
63
  name: simplecov
57
64
  requirement: !ruby/object:Gem::Requirement
65
+ none: false
58
66
  requirements:
59
- - - "~>"
67
+ - - ~>
60
68
  - !ruby/object:Gem::Version
61
69
  version: '0.9'
62
70
  type: :development
63
71
  prerelease: false
64
72
  version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
65
74
  requirements:
66
- - - "~>"
75
+ - - ~>
67
76
  - !ruby/object:Gem::Version
68
77
  version: '0.9'
69
78
  - !ruby/object:Gem::Dependency
70
79
  name: simplecov-html
71
80
  requirement: !ruby/object:Gem::Requirement
81
+ none: false
72
82
  requirements:
73
- - - "~>"
83
+ - - ~>
74
84
  - !ruby/object:Gem::Version
75
85
  version: '0.8'
76
86
  type: :development
77
87
  prerelease: false
78
88
  version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
79
90
  requirements:
80
- - - "~>"
91
+ - - ~>
81
92
  - !ruby/object:Gem::Version
82
93
  version: '0.8'
83
94
  description: It read file containing list of urls and produces output file with domain,
@@ -88,7 +99,9 @@ executables: []
88
99
  extensions: []
89
100
  extra_rdoc_files: []
90
101
  files:
91
- - ".gitignore"
102
+ - .coveralls.yml
103
+ - .gitignore
104
+ - .travis.yml
92
105
  - Gemfile
93
106
  - LICENSE.txt
94
107
  - README.md
@@ -96,32 +109,35 @@ files:
96
109
  - lib/socialcrawler.rb
97
110
  - lib/socialcrawler/version.rb
98
111
  - socialcrawler.gemspec
112
+ - test/test_crawler.rb
99
113
  - test/test_helper.rb
100
- - test/unit/semver_test.rb
114
+ - test/test_url.txt
101
115
  homepage: http://github.com/iceraj/socialcrawler
102
116
  licenses:
103
117
  - LGPL 2.1
104
- metadata: {}
105
118
  post_install_message:
106
119
  rdoc_options: []
107
120
  require_paths:
108
121
  - lib
109
122
  required_ruby_version: !ruby/object:Gem::Requirement
123
+ none: false
110
124
  requirements:
111
- - - ">="
125
+ - - ! '>='
112
126
  - !ruby/object:Gem::Version
113
127
  version: '0'
114
128
  required_rubygems_version: !ruby/object:Gem::Requirement
129
+ none: false
115
130
  requirements:
116
- - - ">="
131
+ - - ! '>='
117
132
  - !ruby/object:Gem::Version
118
133
  version: '0'
119
134
  requirements: []
120
135
  rubyforge_project:
121
- rubygems_version: 2.4.5
136
+ rubygems_version: 1.8.24
122
137
  signing_key:
123
- specification_version: 4
138
+ specification_version: 3
124
139
  summary: SocialCrawler looks for social media links for different sites
125
140
  test_files:
141
+ - test/test_crawler.rb
126
142
  - test/test_helper.rb
127
- - test/unit/semver_test.rb
143
+ - test/test_url.txt
checksums.yaml DELETED
@@ -1,7 +0,0 @@
1
- ---
2
- SHA1:
3
- metadata.gz: 1bfd2a9261b07fb456185b808c9f532128f0fcdc
4
- data.tar.gz: 42f27c1214c3b8e850a6b9106bc0002a0f76d862
5
- SHA512:
6
- metadata.gz: 21878df5ce6b2c75e3af24e4f2b950076098af255c88e900d639e3852eedb21587d485c83271df3b08bd2d6a2f363d2faf8e9052b3d497431f46b7f195433de3
7
- data.tar.gz: e59148b275a58ec7ad5b692c25f030e94ce2ed585b5b69010d39df19fb17b112a62c65204a59a8e2fafa23485881d67d9cc7027a82dbb00192664ed0056170fd
@@ -1,14 +0,0 @@
1
- require "test/unit"
2
- require 'semantic'
3
- require 'socialcrawler'
4
- require 'test_helper'
5
-
6
- class VersioningTest < Test::Unit::TestCase
7
-
8
- def test_version
9
- s = SocialCrawler::VERSION
10
- v = Semantic::Version.new(s)
11
- assert_equal v.to_s, s
12
- end
13
-
14
- end