socialcrawler 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +2 -3
- data/Gemfile +1 -0
- data/lib/socialcrawler.rb +47 -27
- data/lib/socialcrawler/version.rb +1 -1
- data/test/test_helper.rb +0 -2
- metadata +3 -4
- data/test_status.txt +0 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 479202e48815c59eb775fc309eb5f771b27c7c16
|
4
|
+
data.tar.gz: b085817b3960a9397e0379561ecb27e81ea1fcd6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 38f539ee81079a7000ca42b6074caf4269db86a41992952d40b243d3f64970fd5319acad8aea8f488b113ff199438a986d8c51a407bc293aa92c8d184f2718a5
|
7
|
+
data.tar.gz: 93806c94207115dbe3ea49f0f6192eb8e9ad36fa853a13c2ff7fbfd247b5e9247fed6a975f1eb53ef3130c637686c0dd4eec7518e50c51f99cbfc595f267af99
|
data/.travis.yml
CHANGED
data/Gemfile
CHANGED
data/lib/socialcrawler.rb
CHANGED
@@ -28,7 +28,12 @@ module SocialCrawler
|
|
28
28
|
@map = {
|
29
29
|
twitter: 'twitter.com/',
|
30
30
|
facebook: 'facebook.com/',
|
31
|
-
google_plus: 'plus.google.com/'
|
31
|
+
google_plus: 'plus.google.com/',
|
32
|
+
instagram: 'www.instagram.com',
|
33
|
+
you_tube: 'youtube.com/user',
|
34
|
+
pinterest: 'pinterest.com/',
|
35
|
+
linked_in: 'linkedin.com/',
|
36
|
+
flickr: 'flickr.com/'
|
32
37
|
}
|
33
38
|
end
|
34
39
|
|
@@ -57,7 +62,7 @@ module SocialCrawler
|
|
57
62
|
def crawl_url(url, log=nil)
|
58
63
|
log = Logger.new(STDOUT) if log.nil?
|
59
64
|
log.info("Crawling #{url}")
|
60
|
-
result = Hash.new(
|
65
|
+
result = Hash.new(:NOT_FOUND)
|
61
66
|
begin
|
62
67
|
page = Nokogiri::HTML(open(url))
|
63
68
|
title = page.css('title')
|
@@ -81,13 +86,7 @@ module SocialCrawler
|
|
81
86
|
if not status_filename.nil? and File.exists?(status_filename)
|
82
87
|
log.info("Loading previous status from #{status_filename}")
|
83
88
|
CSV.foreach(status_filename) do |row|
|
84
|
-
|
85
|
-
status[row[0]] = {
|
86
|
-
:url => row[0],
|
87
|
-
:result => row[1],
|
88
|
-
:message => row[2]
|
89
|
-
}
|
90
|
-
end
|
89
|
+
set_status_cache_data(status, row)
|
91
90
|
end
|
92
91
|
log.info("Loading previous status from #{status_filename} finished, #{status.keys.length} loaded.")
|
93
92
|
end
|
@@ -101,16 +100,7 @@ module SocialCrawler
|
|
101
100
|
return data
|
102
101
|
end
|
103
102
|
CSV.foreach(output_list_filename) do |row|
|
104
|
-
|
105
|
-
if row.count >= 5
|
106
|
-
data[row[0]] = {
|
107
|
-
:url => row[0],
|
108
|
-
:title => row[1],
|
109
|
-
:twitter => row[2],
|
110
|
-
:facebook => row[3],
|
111
|
-
:google_plus => row[4]
|
112
|
-
}
|
113
|
-
end
|
103
|
+
set_output_cache_data(data, row)
|
114
104
|
log.info("Loading previous status from #{output_list_filename} finished, #{data.keys.length} loaded.")
|
115
105
|
end
|
116
106
|
return data
|
@@ -125,13 +115,9 @@ module SocialCrawler
|
|
125
115
|
data = load_output_cache(output_list_filename, log)
|
126
116
|
|
127
117
|
CSV.open(output_list_filename, "wb") do |output|
|
128
|
-
data
|
129
|
-
output << [k, v[:title], v[:twitter], v[:facebook], v[:google_plus]]
|
130
|
-
end
|
118
|
+
write_data(data, output)
|
131
119
|
CSV.open(status_filename, "wb") do |status_line|
|
132
|
-
status
|
133
|
-
status_line << [k, v[:success], v[:message]]
|
134
|
-
end
|
120
|
+
write_status(status, status_line)
|
135
121
|
crawl_loop(data, domain_list_filename, log, output, status, status_line)
|
136
122
|
end
|
137
123
|
end
|
@@ -151,6 +137,18 @@ module SocialCrawler
|
|
151
137
|
|
152
138
|
private
|
153
139
|
|
140
|
+
def write_data(data, output)
|
141
|
+
data.each do |k, v|
|
142
|
+
output << [k, v[:title], v[:twitter], v[:facebook], v[:google_plus]]
|
143
|
+
end
|
144
|
+
end
|
145
|
+
|
146
|
+
def write_status(status, status_line)
|
147
|
+
status.each do |k, v|
|
148
|
+
status_line << [k, v[:success], v[:message]]
|
149
|
+
end
|
150
|
+
end
|
151
|
+
|
154
152
|
def set_data(result, url, data, output)
|
155
153
|
if result[:success] == true
|
156
154
|
data[url] = result
|
@@ -166,11 +164,33 @@ module SocialCrawler
|
|
166
164
|
}
|
167
165
|
status_line << [url, result[:success], result[:message]]
|
168
166
|
end
|
167
|
+
|
168
|
+
def set_output_cache_data(data, row)
|
169
|
+
if row.count >= 5
|
170
|
+
data[row[0]] = {
|
171
|
+
:url => row[0],
|
172
|
+
:title => row[1],
|
173
|
+
:twitter => row[2],
|
174
|
+
:facebook => row[3],
|
175
|
+
:google_plus => row[4]
|
176
|
+
}
|
177
|
+
end
|
178
|
+
end
|
179
|
+
|
180
|
+
def set_status_cache_data(status, row)
|
181
|
+
if row.count >= 3
|
182
|
+
status[row[0]] = {
|
183
|
+
:url => row[0],
|
184
|
+
:result => row[1],
|
185
|
+
:message => row[2]
|
186
|
+
}
|
187
|
+
end
|
188
|
+
end
|
169
189
|
end
|
170
190
|
end
|
171
191
|
|
172
192
|
if __FILE__ == $0
|
173
|
-
|
193
|
+
#:nocov:
|
174
194
|
SocialCrawler::SocialCrawler.new.crawl(ARGV[0], ARGV[1], ARGV[2])
|
175
|
-
|
195
|
+
#:nocov:
|
176
196
|
end
|
data/test/test_helper.rb
CHANGED
@@ -1,11 +1,9 @@
|
|
1
|
-
require "codeclimate-test-reporter"
|
2
1
|
require 'simplecov'
|
3
2
|
require 'coveralls'
|
4
3
|
SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter[
|
5
4
|
SimpleCov::Formatter::HTMLFormatter,
|
6
5
|
Coveralls::SimpleCov::Formatter
|
7
6
|
]
|
8
|
-
CodeClimate::TestReporter.start
|
9
7
|
SimpleCov.start
|
10
8
|
puts "Simple Coverage Started"
|
11
9
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: socialcrawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ivica Ceraj
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-02-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -101,7 +101,6 @@ files:
|
|
101
101
|
- test/test_crawler.rb
|
102
102
|
- test/test_helper.rb
|
103
103
|
- test/test_url.txt
|
104
|
-
- test_status.txt
|
105
104
|
homepage: http://github.com/iceraj/socialcrawler
|
106
105
|
licenses:
|
107
106
|
- LGPL 2.1
|
@@ -122,7 +121,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
122
121
|
version: '0'
|
123
122
|
requirements: []
|
124
123
|
rubyforge_project:
|
125
|
-
rubygems_version: 2.
|
124
|
+
rubygems_version: 2.5.1
|
126
125
|
signing_key:
|
127
126
|
specification_version: 4
|
128
127
|
summary: SocialCrawler looks for social media links for different sites
|
data/test_status.txt
DELETED