socialcrawler 0.0.0 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.coveralls.yml +1 -0
- data/.travis.yml +10 -0
- data/Gemfile +4 -1
- data/README.md +6 -2
- data/Rakefile +7 -0
- data/lib/socialcrawler.rb +120 -104
- data/lib/socialcrawler/version.rb +1 -1
- data/test/test_crawler.rb +25 -0
- data/test/test_helper.rb +11 -1
- data/test/test_url.txt +4 -0
- metadata +36 -20
- checksums.yaml +0 -7
- data/test/unit/semver_test.rb +0 -14
data/.coveralls.yml
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
service_name: travis-ci
|
data/.travis.yml
ADDED
data/Gemfile
CHANGED
@@ -3,4 +3,7 @@ source 'https://rubygems.org'
|
|
3
3
|
# Specify your gem's dependencies in SocialCrawler.gemspec
|
4
4
|
gemspec
|
5
5
|
|
6
|
-
gem 'nokogiri', '
|
6
|
+
gem 'nokogiri', '1.6.4.1'
|
7
|
+
# test coverage
|
8
|
+
gem 'coveralls', require: false, group: :test
|
9
|
+
gem "codeclimate-test-reporter", group: :test, require: nil
|
data/README.md
CHANGED
@@ -1,6 +1,10 @@
|
|
1
|
+
[![Coverage Status](https://img.shields.io/coveralls/iceraj/socialcrawler.svg)](https://coveralls.io/r/iceraj/socialcrawler)
|
2
|
+
[![Build Status](https://travis-ci.org/iceraj/socialcrawler.svg?branch=feature%2FIntial_Development)](https://travis-ci.org/iceraj/socialcrawler)
|
3
|
+
[![Code Climate](https://codeclimate.com/github/iceraj/socialcrawler/badges/gpa.svg)](https://codeclimate.com/github/iceraj/socialcrawler)
|
4
|
+
|
1
5
|
# Socialcrawler
|
2
6
|
|
3
|
-
|
7
|
+
SocialCrawler looks for social media links for different sites
|
4
8
|
|
5
9
|
## Installation
|
6
10
|
|
@@ -24,7 +28,7 @@ TODO: Write usage instructions here
|
|
24
28
|
|
25
29
|
## Contributing
|
26
30
|
|
27
|
-
1. Fork it ( https://github.com/
|
31
|
+
1. Fork it ( https://github.com/iceraj/socialcrawler/fork )
|
28
32
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
29
33
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
30
34
|
4. Push to the branch (`git push origin my-new-feature`)
|
data/Rakefile
CHANGED
data/lib/socialcrawler.rb
CHANGED
@@ -22,139 +22,155 @@ require 'logger'
|
|
22
22
|
|
23
23
|
module SocialCrawler
|
24
24
|
|
25
|
+
class SocialCrawler
|
25
26
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
log.info( "Multiple values for #{symbol} value #{hash[symbol]}")
|
27
|
+
# Builds the lookup table used to recognise social-network links.
#
# Each key names the result field a matching link is stored under; each value
# is the substring that marks a link as belonging to that network.
def initialize
  @map = {
    :twitter     => 'twitter.com/',
    :facebook    => 'facebook.com/',
    :google_plus => 'plus.google.com/'
  }
end
|
34
|
-
end
|
35
34
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
if not title.nil?
|
44
|
-
result[:title] = title.text.strip
|
35
|
+
# Stores +value+ in +hash+ under +symbol+, joining repeated values.
#
# The first value seen for a key is stored as given. Every later value for
# the same key is appended to the stored one, separated by a single space,
# and the combined value is reported through the logger.
#
# hash   - Hash collecting the values; modified in place.
# symbol - key to store the value under.
# value  - value to store or append.
# log    - object responding to +info+. When omitted, a Logger writing to
#          STDOUT is created, but only if there is something to report.
def _put(hash, symbol, value, log = nil)
  if hash.key?(symbol)
    hash[symbol] = "#{hash[symbol]} #{value}"
    # Created lazily: this method runs once per matching link, and building
    # a Logger for calls that never log is wasted work.
    log ||= Logger.new(STDOUT)
    log.info("Multiple values for #{symbol} value #{hash[symbol]}")
  else
    hash[symbol] = value
  end
end
|
44
|
+
|
45
|
+
# Files every recognised social-network link on +page+ into +result+.
#
# page   - parsed document responding to +css+.
# result - Hash that receives one entry per network found.
# log    - logger handed through to _put.
#
# Every anchor that carries an href is compared against each marker in @map;
# a link whose href contains a marker is stored under that marker's key.
def page_to_result(page, result, log)
  page.css('a[href]').each do |anchor|
    href = anchor['href']
    @map.each_pair do |network, marker|
      next unless href.include?(marker)

      _put(result, network, href, log)
    end
  end
end
|
56
|
+
|
57
|
+
# Fetches +url+ and extracts its title and social-network links.
#
# url - address to fetch.
# log - object responding to +info+; defaults to a Logger on STDOUT.
#
# Returns a Hash whose default value is 'NOT FOUND'. It always carries :url,
# :success and :message. On success :message is '' and :title plus whatever
# page_to_result found are filled in; on failure :message holds the error
# text.
def crawl_url(url, log = nil)
  log ||= Logger.new(STDOUT)
  log.info("Crawling #{url}")
  result = Hash.new('NOT FOUND')
  result[:url] = url
  begin
    parse = lambda { |io| Nokogiri::HTML(io) }
    # Kernel#open stopped accepting URLs in Ruby 3.0, so URI.open is used
    # wherever open-uri provides it. The block form closes the stream as soon
    # as parsing is done.
    # NOTE(review): both calls fall back to Kernel#open for strings that are
    # not URLs (local paths, "|command"); validate the list if it can come
    # from an untrusted source.
    page = URI.respond_to?(:open) ? URI.open(url, &parse) : open(url, &parse)
    # css returns a (possibly empty) node set, never nil, so no nil check.
    result[:title] = page.css('title').text.strip
    page_to_result(page, result, log)
    result[:success] = true
    result[:message] = ''
  rescue StandardError => e
    # StandardError rather than Exception, so Ctrl-C and exit requests are
    # not recorded as a failed crawl and swallowed.
    result[:success] = false
    result[:message] = e.to_s
  end
  result
end
|
78
|
+
|
79
|
+
# Loads the per-URL crawl status written by an earlier run.
#
# status_filename - path of the status CSV; may be nil or point to a file
#                   that does not exist, in which case nothing is loaded.
# log             - object responding to +info+; defaults to a Logger on
#                   STDOUT so the method can be called without one.
#
# Each usable row has at least three columns: url, result, message. Shorter
# rows are skipped.
#
# Returns a Hash mapping url to { :url, :result, :message }.
def load_status_cache(status_filename, log = nil)
  log ||= Logger.new(STDOUT)
  status = {}
  return status if status_filename.nil? || !File.exist?(status_filename)

  log.info("Loading previous status from #{status_filename}")
  CSV.foreach(status_filename) do |row|
    next if row.count < 3

    url = row[0]
    status[url] = {
      :url => url,
      :result => row[1],
      :message => row[2]
    }
  end
  log.info("Loading previous status from #{status_filename} finished, #{status.keys.length} loaded.")
  status
end
|
71
|
-
return result
|
72
|
-
end
|
73
100
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
CSV.foreach(
|
81
|
-
|
101
|
+
# Loads the crawl results written by an earlier run.
#
# output_list_filename - path of the output CSV; a missing file yields an
#                        empty result.
# log                  - object responding to +info+; defaults to a Logger on
#                        STDOUT so the method can be called without one.
#
# Each usable row has at least five columns: url, title, twitter, facebook,
# google_plus. Shorter rows are skipped.
#
# Returns a Hash mapping url to
# { :url, :title, :twitter, :facebook, :google_plus }.
def load_output_cache(output_list_filename, log = nil)
  log ||= Logger.new(STDOUT)
  data = {}
  log.info("Loading previous status from #{output_list_filename}")
  return data unless File.exist?(output_list_filename)

  CSV.foreach(output_list_filename) do |row|
    log.info("Loading #{row} #{row.count}")
    next if row.count < 5

    url = row[0]
    data[url] = {
      :url => url,
      :title => row[1],
      :twitter => row[2],
      :facebook => row[3],
      :google_plus => row[4]
    }
  end
  # Reported once, after the whole file has been read, rather than once per
  # row.
  log.info("Loading previous status from #{output_list_filename} finished, #{data.keys.length} loaded.")
  data
end
|
96
128
|
|
97
|
-
|
98
|
-
|
99
|
-
log.info(
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
129
|
+
# Crawls every URL in a list, reusing results from earlier runs.
#
# domain_list_filename - CSV whose first column holds the URLs to crawl.
# output_list_filename - CSV of results; read as a cache, then rewritten.
# status_filename      - CSV of per-URL status; read as a cache, then
#                        rewritten. When nil, status is written to the null
#                        device instead of a file.
# log                  - object responding to +info+; defaults to a Logger on
#                        STDOUT.
#
# Both files are rewritten from the cached entries first, then crawl_loop
# appends a row for each URL it processes.
def crawl(domain_list_filename, output_list_filename, status_filename = nil, log = nil)
  log ||= Logger.new(STDOUT)
  log.info("Crawler started")

  status = load_status_cache(status_filename, log)
  data = load_output_cache(output_list_filename, log)

  # status_filename is optional, so fall back to a sink CSV.open can accept.
  status_target = status_filename || File::NULL

  CSV.open(output_list_filename, "wb") do |output|
    data.each do |url, row|
      output << [url, row[:title], row[:twitter], row[:facebook], row[:google_plus]]
    end
    CSV.open(status_target, "wb") do |status_line|
      status.each do |url, entry|
        # Status entries keep the outcome under :result (see
        # load_status_cache and crawl_loop); reading :success here would
        # write an empty column and lose the cached outcome.
        status_line << [url, entry[:result], entry[:message]]
      end
      crawl_loop(data, domain_list_filename, log, output, status, status_line)
    end
  end
end
|
118
149
|
|
119
|
-
|
150
|
+
# Crawls each URL from the domain list that has no status entry yet.
#
# data                 - Hash of results by url; successful crawls are added.
# domain_list_filename - CSV whose first column holds the URLs.
# log                  - logger handed through to crawl_url.
# output               - sink responding to << that receives one result row
#                        per successful crawl.
# status               - Hash of status entries by url; URLs already present
#                        are skipped, new outcomes are added.
# status_line          - sink responding to << that receives one status row
#                        per crawled URL.
def crawl_loop(data, domain_list_filename, log, output, status, status_line)
  CSV.foreach(domain_list_filename) do |row|
    url = row[0]
    # A blank line in the list has no URL; skip it instead of crawling nil
    # and recording a bogus failure.
    next if url.nil? || url.empty?
    next if status.key?(url)

    result = crawl_url(url, log)
    if result[:success]
      data[url] = result
      output << [url, result[:title], result[:twitter], result[:facebook], result[:google_plus]]
    end
    status[url] = {
      :url => url,
      :result => result[:success],
      :message => result[:message]
    }
    status_line << [url, result[:success], result[:message]]
  end
end
|
155
169
|
end
|
156
170
|
end
|
157
171
|
|
158
172
|
# Command-line entry point: runs a crawl when this file is executed directly.
# Arguments: domain list, output file and, optionally, status file.
if __FILE__ == $0
  #:nocov:
  # Fail with a usage line instead of a TypeError from deep inside CSV when
  # the required file names are missing.
  abort "Usage: #{$0} DOMAIN_LIST OUTPUT_FILE [STATUS_FILE]" if ARGV.length < 2
  SocialCrawler::SocialCrawler.new.crawl(ARGV[0], ARGV[1], ARGV[2])
  #:nocov:
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
require "test/unit"
|
3
|
+
require 'semantic'
|
4
|
+
require 'socialcrawler'
|
5
|
+
|
6
|
+
# Smoke tests for the gem: version format and an end-to-end crawl.
# NOTE(review): the class name looks like a typo for "CrawlerTest"; it is
# left unchanged here.
class CrawlewrTest < Test::Unit::TestCase

  OUTPUT_FILE = '/tmp/test_out.txt'
  STATUS_FILE = '/tmp/test_status.txt'

  # The gem version must round-trip through a semantic-version parser.
  def test_version
    s = SocialCrawler::VERSION
    v = Semantic::Version.new(s)
    assert_equal v.to_s, s
  end

  # Crawls the sample list twice: first from a clean slate, then again so the
  # second pass starts from the files the first pass wrote.
  def test_1
    # File.exist? rather than File.exists?, which was removed in Ruby 3.2.
    [OUTPUT_FILE, STATUS_FILE].each do |path|
      File.delete(path) if File.exist?(path)
    end

    sc = SocialCrawler::SocialCrawler.new
    sc.crawl('test/test_url.txt', OUTPUT_FILE, STATUS_FILE)
    assert File.exist?(OUTPUT_FILE)
    assert File.exist?(STATUS_FILE)

    sc = SocialCrawler::SocialCrawler.new
    sc.crawl('test/test_url.txt', OUTPUT_FILE, STATUS_FILE)
    assert File.exist?(OUTPUT_FILE)
    assert File.exist?(STATUS_FILE)
  end

end
|
data/test/test_helper.rb
CHANGED
@@ -1,2 +1,12 @@
|
|
1
|
+
require "codeclimate-test-reporter"
|
1
2
|
require 'simplecov'
|
2
|
-
|
3
|
+
require 'coveralls'
|
4
|
+
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
|
5
|
+
SimpleCov::Formatter::HTMLFormatter,
|
6
|
+
Coveralls::SimpleCov::Formatter
|
7
|
+
]
|
8
|
+
CodeClimate::TestReporter.start
|
9
|
+
SimpleCov.start
|
10
|
+
puts "Simple Coverage Started"
|
11
|
+
|
12
|
+
|
data/test/test_url.txt
ADDED
metadata
CHANGED
@@ -1,83 +1,94 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: socialcrawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
|
+
prerelease:
|
5
6
|
platform: ruby
|
6
7
|
authors:
|
7
8
|
- Ivica Ceraj
|
8
9
|
autorequire:
|
9
10
|
bindir: bin
|
10
11
|
cert_chain: []
|
11
|
-
date: 2015-01-
|
12
|
+
date: 2015-01-16 00:00:00.000000000 Z
|
12
13
|
dependencies:
|
13
14
|
- !ruby/object:Gem::Dependency
|
14
15
|
name: bundler
|
15
16
|
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
16
18
|
requirements:
|
17
|
-
- -
|
19
|
+
- - ~>
|
18
20
|
- !ruby/object:Gem::Version
|
19
21
|
version: '1.7'
|
20
22
|
type: :development
|
21
23
|
prerelease: false
|
22
24
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
23
26
|
requirements:
|
24
|
-
- -
|
27
|
+
- - ~>
|
25
28
|
- !ruby/object:Gem::Version
|
26
29
|
version: '1.7'
|
27
30
|
- !ruby/object:Gem::Dependency
|
28
31
|
name: rake
|
29
32
|
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
30
34
|
requirements:
|
31
|
-
- -
|
35
|
+
- - ~>
|
32
36
|
- !ruby/object:Gem::Version
|
33
37
|
version: '10.0'
|
34
38
|
type: :development
|
35
39
|
prerelease: false
|
36
40
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
37
42
|
requirements:
|
38
|
-
- -
|
43
|
+
- - ~>
|
39
44
|
- !ruby/object:Gem::Version
|
40
45
|
version: '10.0'
|
41
46
|
- !ruby/object:Gem::Dependency
|
42
47
|
name: semantic
|
43
48
|
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
44
50
|
requirements:
|
45
|
-
- -
|
51
|
+
- - ~>
|
46
52
|
- !ruby/object:Gem::Version
|
47
53
|
version: '1.0'
|
48
54
|
type: :development
|
49
55
|
prerelease: false
|
50
56
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
51
58
|
requirements:
|
52
|
-
- -
|
59
|
+
- - ~>
|
53
60
|
- !ruby/object:Gem::Version
|
54
61
|
version: '1.0'
|
55
62
|
- !ruby/object:Gem::Dependency
|
56
63
|
name: simplecov
|
57
64
|
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
58
66
|
requirements:
|
59
|
-
- -
|
67
|
+
- - ~>
|
60
68
|
- !ruby/object:Gem::Version
|
61
69
|
version: '0.9'
|
62
70
|
type: :development
|
63
71
|
prerelease: false
|
64
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
65
74
|
requirements:
|
66
|
-
- -
|
75
|
+
- - ~>
|
67
76
|
- !ruby/object:Gem::Version
|
68
77
|
version: '0.9'
|
69
78
|
- !ruby/object:Gem::Dependency
|
70
79
|
name: simplecov-html
|
71
80
|
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
72
82
|
requirements:
|
73
|
-
- -
|
83
|
+
- - ~>
|
74
84
|
- !ruby/object:Gem::Version
|
75
85
|
version: '0.8'
|
76
86
|
type: :development
|
77
87
|
prerelease: false
|
78
88
|
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
79
90
|
requirements:
|
80
|
-
- -
|
91
|
+
- - ~>
|
81
92
|
- !ruby/object:Gem::Version
|
82
93
|
version: '0.8'
|
83
94
|
description: It read file containing list of urls and produces output file with domain,
|
@@ -88,7 +99,9 @@ executables: []
|
|
88
99
|
extensions: []
|
89
100
|
extra_rdoc_files: []
|
90
101
|
files:
|
91
|
-
-
|
102
|
+
- .coveralls.yml
|
103
|
+
- .gitignore
|
104
|
+
- .travis.yml
|
92
105
|
- Gemfile
|
93
106
|
- LICENSE.txt
|
94
107
|
- README.md
|
@@ -96,32 +109,35 @@ files:
|
|
96
109
|
- lib/socialcrawler.rb
|
97
110
|
- lib/socialcrawler/version.rb
|
98
111
|
- socialcrawler.gemspec
|
112
|
+
- test/test_crawler.rb
|
99
113
|
- test/test_helper.rb
|
100
|
-
- test/
|
114
|
+
- test/test_url.txt
|
101
115
|
homepage: http://github.com/iceraj/socialcrawler
|
102
116
|
licenses:
|
103
117
|
- LGPL 2.1
|
104
|
-
metadata: {}
|
105
118
|
post_install_message:
|
106
119
|
rdoc_options: []
|
107
120
|
require_paths:
|
108
121
|
- lib
|
109
122
|
required_ruby_version: !ruby/object:Gem::Requirement
|
123
|
+
none: false
|
110
124
|
requirements:
|
111
|
-
- -
|
125
|
+
- - ! '>='
|
112
126
|
- !ruby/object:Gem::Version
|
113
127
|
version: '0'
|
114
128
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
115
130
|
requirements:
|
116
|
-
- -
|
131
|
+
- - ! '>='
|
117
132
|
- !ruby/object:Gem::Version
|
118
133
|
version: '0'
|
119
134
|
requirements: []
|
120
135
|
rubyforge_project:
|
121
|
-
rubygems_version:
|
136
|
+
rubygems_version: 1.8.24
|
122
137
|
signing_key:
|
123
|
-
specification_version:
|
138
|
+
specification_version: 3
|
124
139
|
summary: SocialCrawler looks for social media links for different sites
|
125
140
|
test_files:
|
141
|
+
- test/test_crawler.rb
|
126
142
|
- test/test_helper.rb
|
127
|
-
- test/
|
143
|
+
- test/test_url.txt
|
checksums.yaml
DELETED
@@ -1,7 +0,0 @@
|
|
1
|
-
---
|
2
|
-
SHA1:
|
3
|
-
metadata.gz: 1bfd2a9261b07fb456185b808c9f532128f0fcdc
|
4
|
-
data.tar.gz: 42f27c1214c3b8e850a6b9106bc0002a0f76d862
|
5
|
-
SHA512:
|
6
|
-
metadata.gz: 21878df5ce6b2c75e3af24e4f2b950076098af255c88e900d639e3852eedb21587d485c83271df3b08bd2d6a2f363d2faf8e9052b3d497431f46b7f195433de3
|
7
|
-
data.tar.gz: e59148b275a58ec7ad5b692c25f030e94ce2ed585b5b69010d39df19fb17b112a62c65204a59a8e2fafa23485881d67d9cc7027a82dbb00192664ed0056170fd
|
data/test/unit/semver_test.rb
DELETED
@@ -1,14 +0,0 @@
|
|
1
|
-
require "test/unit"
|
2
|
-
require 'semantic'
|
3
|
-
require 'socialcrawler'
|
4
|
-
require 'test_helper'
|
5
|
-
|
6
|
-
class VersioningTest < Test::Unit::TestCase
|
7
|
-
|
8
|
-
def test_version
|
9
|
-
s = SocialCrawler::VERSION
|
10
|
-
v = Semantic::Version.new(s)
|
11
|
-
assert_equal v.to_s, s
|
12
|
-
end
|
13
|
-
|
14
|
-
end
|