socialcrawler 0.0.0 → 0.0.2
This diff shows the content changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
- data/.coveralls.yml +1 -0
- data/.travis.yml +10 -0
- data/Gemfile +4 -1
- data/README.md +6 -2
- data/Rakefile +7 -0
- data/lib/socialcrawler.rb +120 -104
- data/lib/socialcrawler/version.rb +1 -1
- data/test/test_crawler.rb +25 -0
- data/test/test_helper.rb +11 -1
- data/test/test_url.txt +4 -0
- metadata +36 -20
- checksums.yaml +0 -7
- data/test/unit/semver_test.rb +0 -14
data/.coveralls.yml
ADDED
@@ -0,0 +1 @@
+service_name: travis-ci
data/.travis.yml
ADDED
data/Gemfile
CHANGED
@@ -3,4 +3,7 @@ source 'https://rubygems.org'
 # Specify your gem's dependencies in SocialCrawler.gemspec
 gemspec
 
-gem 'nokogiri', '
+gem 'nokogiri', '1.6.4.1'
+# test coverage
+gem 'coveralls', require: false, group: :test
+gem "codeclimate-test-reporter", group: :test, require: nil
data/README.md
CHANGED
@@ -1,6 +1,10 @@
+[](https://coveralls.io/r/iceraj/socialcrawler)
+[](https://travis-ci.org/iceraj/socialcrawler)
+[](https://codeclimate.com/github/iceraj/socialcrawler)
+
 # Socialcrawler
 
-
+SocialCrawler looks for social media links for different sites
 
 ## Installation
 
@@ -24,7 +28,7 @@ TODO: Write usage instructions here
 
 ## Contributing
 
-1. Fork it ( https://github.com/
+1. Fork it ( https://github.com/iceraj/socialcrawler/fork )
 2. Create your feature branch (`git checkout -b my-new-feature`)
 3. Commit your changes (`git commit -am 'Add some feature'`)
 4. Push to the branch (`git push origin my-new-feature`)
data/Rakefile
CHANGED
data/lib/socialcrawler.rb
CHANGED
@@ -22,139 +22,155 @@ require 'logger'
 
 module SocialCrawler
 
+  class SocialCrawler
 
-
-
-
-
-
-
-    log.info( "Multiple values for #{symbol} value #{hash[symbol]}")
+    def initialize
+      @map = {
+          twitter: 'twitter.com/',
+          facebook: 'facebook.com/',
+          google_plus: 'plus.google.com/'
+      }
     end
-  end
 
-
-
-
-
-
-
-
-    if not title.nil?
-      result[:title] = title.text.strip
+    def _put(hash, symbol, value, log=nil)
+      log = Logger.new(STDOUT) if log.nil?
+      if not hash.has_key?(symbol)
+        hash[symbol] = value
+      else
+        hash[symbol] = "#{hash[symbol]} #{value}"
+        log.info("Multiple values for #{symbol} value #{hash[symbol]}")
       end
+    end
+
+    def page_to_result(page, result, log)
       links = page.css('a[href]')
       links.each do |link|
         link_url = link['href']
-
-
-
-
+        @map.each do |k, prefix|
+          if not link_url.index(prefix).nil?
+            _put(result, k, link_url, log)
+          end
         end
-
-
-
+      end
+    end
+
+    def crawl_url(url, log=nil)
+      log = Logger.new(STDOUT) if log.nil?
+      log.info("Crawling #{url}")
+      result = Hash.new('NOT FOUND')
+      begin
+        page = Nokogiri::HTML(open(url))
+        title = page.css('title')
+        if not title.nil?
+          result[:title] = title.text.strip
         end
-
-
-
+        page_to_result(page, result, log)
+        result[:url] = url
+        result[:success] = true
+        result[:message] = ''
+      rescue Exception => e
+        result[:url] = url
+        result[:success] = false
+        result[:message] = "#{e}"
+      end
+      return result
+    end
+
+    def load_status_cache(status_filename, log=nil)
+      status = Hash.new
+      if not status_filename.nil? and File.exists?(status_filename)
+        log.info("Loading previous status from #{status_filename}")
+        CSV.foreach(status_filename) do |row|
+          if row.count < 3
+            next
+          end
+          url = row[0]
+          result = row[1]
+          message = row[2]
+          status[url] = {
+              :url => url,
+              :result => result,
+              :message => message
+          }
         end
+        log.info("Loading previous status from #{status_filename} finished, #{status.keys.length} loaded.")
       end
-
-      result[:success] = true
-      result[:message] = ''
-    rescue Exception => e
-      result[:url] = url
-      result[:success] = false
-      result[:message] = "#{e}"
+      return status
     end
-    return result
-  end
 
-
-
-
-
-
-
-    CSV.foreach(
-
+    def load_output_cache(output_list_filename, log=nil)
+      data = Hash.new()
+      log.info("Loading previous status from #{output_list_filename}")
+      if not File.exist?(output_list_filename)
+        return data
+      end
+      CSV.foreach(output_list_filename) do |row|
+        log.info("Loading #{row} #{row.count}")
+        if row.count < 5
+          next
+        end
         url = row[0]
-
-
-
+        title= row[1]
+        twitter = row[2]
+        facebook = row[3]
+        google_plus = row[4]
+        data[url] = {
             :url => url,
-            :
-            :
+            :title => title,
+            :twitter => twitter,
+            :facebook => facebook,
+            :google_plus => google_plus
         }
-
-      log.info("Exception reading file #{e}")
-    end
+        log.info("Loading previous status from #{output_list_filename} finished, #{data.keys.length} loaded.")
       end
-
+      return data
     end
 
-
-
-    log.info(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def crawl(domain_list_filename, output_list_filename, status_filename=nil, log=nil)
+      log = Logger.new(STDOUT) if log.nil?
+      log.info("Crawler started")
+
+      status = load_status_cache(status_filename, log)
+
+      data = load_output_cache(output_list_filename, log)
+
+      CSV.open(output_list_filename, "wb") do |output|
+        data.each do |k, v|
+          output << [k, v[:title], v[:twitter], v[:facebook], v[:google_plus]]
+        end
+        CSV.open(status_filename, "wb") do |status_line|
+          status.each do |k, v|
+            status_line << [k, v[:success], v[:message]]
+          end
+          crawl_loop(data, domain_list_filename, log, output, status, status_line)
+        end
       end
-      log.info( "Loading previous status from #{output_list_filename} finished, #{data.keys.length} loaded.")
     end
 
-
+    def crawl_loop(data, domain_list_filename, log, output, status, status_line)
+      CSV.foreach(domain_list_filename) do |row|
        url = row[0]
        if status.has_key?(url)
-
-
-
-
-
-
-          log.info(k)
-          log.info(v)
-          output << [k,v[:title],v[:twitter],v[:facebook],v[:google_plus]]
-        end
-        output << [url, result[:title], result[:twitter], result[:facebook], result[:google_plus]]
-        data[url] = result
-      end
-      status[url] = {
-          :url => url,
-          :result => 'success',
-          :message => ''
-      }
-      CSV.open( status_filename, "wb" ) do |status_line|
-        status_line << [url,'success','']
-      end
-    else
-      status[url] = {
-          :url => url,
-          :result => result[:success],
-          :message => result[:message]
-      }
-      CSV.open( status_filename, "wb" ) do |status_line|
-        status_line << [url,result[:success],result[:message]]
-      end
+          next
+        end
+        result = crawl_url(url, log)
+        if result[:success] == true
+          data[url] = result
+          output << [url, result[:title], result[:twitter], result[:facebook], result[:google_plus]]
         end
+        status[url] = {
+            :url => url,
+            :result => result[:success],
+            :message => result[:message]
+        }
+        status_line << [url, result[:success], result[:message]]
       end
     end
   end
 end
 
 if __FILE__ == $0
-
+  #:nocov:
+  SocialCrawler::SocialCrawler.new.crawl(ARGV[0], ARGV[1], ARGV[2])
+  #:nocov:
 end
data/test/test_crawler.rb
ADDED
@@ -0,0 +1,25 @@
+require 'test_helper'
+require "test/unit"
+require 'semantic'
+require 'socialcrawler'
+
+class CrawlewrTest < Test::Unit::TestCase
+
+  def test_version
+    s = SocialCrawler::VERSION
+    v = Semantic::Version.new(s)
+    assert_equal v.to_s, s
+  end
+
+  def test_1
+    File.delete('/tmp/test_out.txt') if File.exists?('/tmp/test_out.txt')
+    File.delete('/tmp/test_status.txt') if File.exists?('/tmp/test_status.txt')
+
+    sc = SocialCrawler::SocialCrawler.new
+    sc.crawl('test/test_url.txt', '/tmp/test_out.txt', '/tmp/test_status.txt')
+
+    sc = SocialCrawler::SocialCrawler.new
+    sc.crawl('test/test_url.txt', '/tmp/test_out.txt', '/tmp/test_status.txt')
+  end
+
+end
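
Note that `test_1` runs the same crawl twice on purpose: the first pass writes the output and status CSVs, so the second pass exercises `load_status_cache`, `load_output_cache`, and the `next` branch in `crawl_loop` instead of re-fetching. A minimal sketch of that resume behavior in isolation, with hypothetical /tmp paths:

```ruby
require 'socialcrawler'
require 'logger'

log = Logger.new(STDOUT)
sc = SocialCrawler::SocialCrawler.new

# First pass fetches every URL and records each outcome in the status file.
sc.crawl('test/test_url.txt', '/tmp/out.csv', '/tmp/status.csv', log)

# The status file now caches every crawled URL, so a second pass skips them
# all; load_status_cache is the same method crawl uses internally.
status = sc.load_status_cache('/tmp/status.csv', log)
puts "#{status.keys.length} URLs already crawled"
```
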
data/test/test_helper.rb
CHANGED
@@ -1,2 +1,12 @@
+require "codeclimate-test-reporter"
 require 'simplecov'
-
+require 'coveralls'
+SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
+    SimpleCov::Formatter::HTMLFormatter,
+    Coveralls::SimpleCov::Formatter
+]
+CodeClimate::TestReporter.start
+SimpleCov.start
+puts "Simple Coverage Started"
+
+
data/test/test_url.txt
ADDED
metadata
CHANGED
@@ -1,83 +1,94 @@
 --- !ruby/object:Gem::Specification
 name: socialcrawler
 version: !ruby/object:Gem::Version
-  version: 0.0.0
+  version: 0.0.2
+  prerelease:
 platform: ruby
 authors:
 - Ivica Ceraj
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-01-
+date: 2015-01-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
         version: '1.7'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
         version: '1.7'
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
         version: '10.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
         version: '10.0'
 - !ruby/object:Gem::Dependency
   name: semantic
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
         version: '1.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
         version: '1.0'
 - !ruby/object:Gem::Dependency
   name: simplecov
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
         version: '0.9'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
         version: '0.9'
 - !ruby/object:Gem::Dependency
   name: simplecov-html
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
         version: '0.8'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - -
+    - - ~>
       - !ruby/object:Gem::Version
         version: '0.8'
 description: It read file containing list of urls and produces output file with domain,
@@ -88,7 +99,9 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
--
+- .coveralls.yml
+- .gitignore
+- .travis.yml
 - Gemfile
 - LICENSE.txt
 - README.md
@@ -96,32 +109,35 @@ files:
 - lib/socialcrawler.rb
 - lib/socialcrawler/version.rb
 - socialcrawler.gemspec
+- test/test_crawler.rb
 - test/test_helper.rb
-- test/
+- test/test_url.txt
 homepage: http://github.com/iceraj/socialcrawler
 licenses:
 - LGPL 2.1
-metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
-  - -
+  - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
-  - -
+  - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version:
+rubygems_version: 1.8.24
 signing_key:
-specification_version:
+specification_version: 3
 summary: SocialCrawler looks for social media links for different sites
 test_files:
+- test/test_crawler.rb
 - test/test_helper.rb
-- test/
+- test/test_url.txt
checksums.yaml
DELETED
@@ -1,7 +0,0 @@
----
-SHA1:
-  metadata.gz: 1bfd2a9261b07fb456185b808c9f532128f0fcdc
-  data.tar.gz: 42f27c1214c3b8e850a6b9106bc0002a0f76d862
-SHA512:
-  metadata.gz: 21878df5ce6b2c75e3af24e4f2b950076098af255c88e900d639e3852eedb21587d485c83271df3b08bd2d6a2f363d2faf8e9052b3d497431f46b7f195433de3
-  data.tar.gz: e59148b275a58ec7ad5b692c25f030e94ce2ed585b5b69010d39df19fb17b112a62c65204a59a8e2fafa23485881d67d9cc7027a82dbb00192664ed0056170fd
data/test/unit/semver_test.rb
DELETED
@@ -1,14 +0,0 @@
-require "test/unit"
-require 'semantic'
-require 'socialcrawler'
-require 'test_helper'
-
-class VersioningTest < Test::Unit::TestCase
-
-  def test_version
-    s = SocialCrawler::VERSION
-    v = Semantic::Version.new(s)
-    assert_equal v.to_s, s
-  end
-
-end