socialcrawler 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +2 -3
- data/Gemfile +1 -0
- data/lib/socialcrawler.rb +47 -27
- data/lib/socialcrawler/version.rb +1 -1
- data/test/test_helper.rb +0 -2
- metadata +3 -4
- data/test_status.txt +0 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 479202e48815c59eb775fc309eb5f771b27c7c16
|
4
|
+
data.tar.gz: b085817b3960a9397e0379561ecb27e81ea1fcd6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 38f539ee81079a7000ca42b6074caf4269db86a41992952d40b243d3f64970fd5319acad8aea8f488b113ff199438a986d8c51a407bc293aa92c8d184f2718a5
|
7
|
+
data.tar.gz: 93806c94207115dbe3ea49f0f6192eb8e9ad36fa853a13c2ff7fbfd247b5e9247fed6a975f1eb53ef3130c637686c0dd4eec7518e50c51f99cbfc595f267af99
|
data/.travis.yml
CHANGED
data/Gemfile
CHANGED
data/lib/socialcrawler.rb
CHANGED
@@ -28,7 +28,12 @@ module SocialCrawler
|
|
28
28
|
@map = {
|
29
29
|
twitter: 'twitter.com/',
|
30
30
|
facebook: 'facebook.com/',
|
31
|
-
google_plus: 'plus.google.com/'
|
31
|
+
google_plus: 'plus.google.com/',
|
32
|
+
instagram: 'www.instagram.com',
|
33
|
+
you_tube: 'youtube.com/user',
|
34
|
+
pinterest: 'pinterest.com/',
|
35
|
+
linked_in: 'linkedin.com/',
|
36
|
+
flickr: 'flickr.com/'
|
32
37
|
}
|
33
38
|
end
|
34
39
|
|
@@ -57,7 +62,7 @@ module SocialCrawler
|
|
57
62
|
def crawl_url(url, log=nil)
|
58
63
|
log = Logger.new(STDOUT) if log.nil?
|
59
64
|
log.info("Crawling #{url}")
|
60
|
-
result = Hash.new(
|
65
|
+
result = Hash.new(:NOT_FOUND)
|
61
66
|
begin
|
62
67
|
page = Nokogiri::HTML(open(url))
|
63
68
|
title = page.css('title')
|
@@ -81,13 +86,7 @@ module SocialCrawler
|
|
81
86
|
if not status_filename.nil? and File.exists?(status_filename)
|
82
87
|
log.info("Loading previous status from #{status_filename}")
|
83
88
|
CSV.foreach(status_filename) do |row|
|
84
|
-
|
85
|
-
status[row[0]] = {
|
86
|
-
:url => row[0],
|
87
|
-
:result => row[1],
|
88
|
-
:message => row[2]
|
89
|
-
}
|
90
|
-
end
|
89
|
+
set_status_cache_data(status, row)
|
91
90
|
end
|
92
91
|
log.info("Loading previous status from #{status_filename} finished, #{status.keys.length} loaded.")
|
93
92
|
end
|
@@ -101,16 +100,7 @@ module SocialCrawler
|
|
101
100
|
return data
|
102
101
|
end
|
103
102
|
CSV.foreach(output_list_filename) do |row|
|
104
|
-
|
105
|
-
if row.count >= 5
|
106
|
-
data[row[0]] = {
|
107
|
-
:url => row[0],
|
108
|
-
:title => row[1],
|
109
|
-
:twitter => row[2],
|
110
|
-
:facebook => row[3],
|
111
|
-
:google_plus => row[4]
|
112
|
-
}
|
113
|
-
end
|
103
|
+
set_output_cache_data(data, row)
|
114
104
|
log.info("Loading previous status from #{output_list_filename} finished, #{data.keys.length} loaded.")
|
115
105
|
end
|
116
106
|
return data
|
@@ -125,13 +115,9 @@ module SocialCrawler
|
|
125
115
|
data = load_output_cache(output_list_filename, log)
|
126
116
|
|
127
117
|
CSV.open(output_list_filename, "wb") do |output|
|
128
|
-
data.each do |k, v|
|
129
|
-
output << [k, v[:title], v[:twitter], v[:facebook], v[:google_plus]]
|
130
|
-
end
|
118
|
+
write_data(data, output)
|
131
119
|
CSV.open(status_filename, "wb") do |status_line|
|
132
|
-
status.each do |k, v|
|
133
|
-
status_line << [k, v[:success], v[:message]]
|
134
|
-
end
|
120
|
+
write_status(status, status_line)
|
135
121
|
crawl_loop(data, domain_list_filename, log, output, status, status_line)
|
136
122
|
end
|
137
123
|
end
|
@@ -151,6 +137,18 @@ module SocialCrawler
|
|
151
137
|
|
152
138
|
private
|
153
139
|
|
140
|
+
def write_data(data, output)
|
141
|
+
data.each do |k, v|
|
142
|
+
output << [k, v[:title], v[:twitter], v[:facebook], v[:google_plus]]
|
143
|
+
end
|
144
|
+
end
|
145
|
+
|
146
|
+
def write_status(status, status_line)
|
147
|
+
status.each do |k, v|
|
148
|
+
status_line << [k, v[:success], v[:message]]
|
149
|
+
end
|
150
|
+
end
|
151
|
+
|
154
152
|
def set_data(result, url, data, output)
|
155
153
|
if result[:success] == true
|
156
154
|
data[url] = result
|
@@ -166,11 +164,33 @@ module SocialCrawler
|
|
166
164
|
}
|
167
165
|
status_line << [url, result[:success], result[:message]]
|
168
166
|
end
|
167
|
+
|
168
|
+
def set_output_cache_data(data, row)
|
169
|
+
if row.count >= 5
|
170
|
+
data[row[0]] = {
|
171
|
+
:url => row[0],
|
172
|
+
:title => row[1],
|
173
|
+
:twitter => row[2],
|
174
|
+
:facebook => row[3],
|
175
|
+
:google_plus => row[4]
|
176
|
+
}
|
177
|
+
end
|
178
|
+
end
|
179
|
+
|
180
|
+
def set_status_cache_data(status, row)
|
181
|
+
if row.count >= 3
|
182
|
+
status[row[0]] = {
|
183
|
+
:url => row[0],
|
184
|
+
:result => row[1],
|
185
|
+
:message => row[2]
|
186
|
+
}
|
187
|
+
end
|
188
|
+
end
|
169
189
|
end
|
170
190
|
end
|
171
191
|
|
172
192
|
if __FILE__ == $0
|
173
|
-
|
193
|
+
#:nocov:
|
174
194
|
SocialCrawler::SocialCrawler.new.crawl(ARGV[0], ARGV[1], ARGV[2])
|
175
|
-
|
195
|
+
#:nocov:
|
176
196
|
end
|
data/test/test_helper.rb
CHANGED
@@ -1,11 +1,9 @@
|
|
1
|
-
require "codeclimate-test-reporter"
|
2
1
|
require 'simplecov'
|
3
2
|
require 'coveralls'
|
4
3
|
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
|
5
4
|
SimpleCov::Formatter::HTMLFormatter,
|
6
5
|
Coveralls::SimpleCov::Formatter
|
7
6
|
]
|
8
|
-
CodeClimate::TestReporter.start
|
9
7
|
SimpleCov.start
|
10
8
|
puts "Simple Coverage Started"
|
11
9
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: socialcrawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ivica Ceraj
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-02-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -101,7 +101,6 @@ files:
|
|
101
101
|
- test/test_crawler.rb
|
102
102
|
- test/test_helper.rb
|
103
103
|
- test/test_url.txt
|
104
|
-
- test_status.txt
|
105
104
|
homepage: http://github.com/iceraj/socialcrawler
|
106
105
|
licenses:
|
107
106
|
- LGPL 2.1
|
@@ -122,7 +121,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
122
121
|
version: '0'
|
123
122
|
requirements: []
|
124
123
|
rubyforge_project:
|
125
|
-
rubygems_version: 2.
|
124
|
+
rubygems_version: 2.5.1
|
126
125
|
signing_key:
|
127
126
|
specification_version: 4
|
128
127
|
summary: SocialCrawler looks for social media links for different sites
|
data/test_status.txt
DELETED