contentar 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: effbdf214b6d437503e5a3141b2f86533e98765f
4
+ data.tar.gz: dac8cada25f67a7d7e44e852462f5fe17edfad9f
5
+ SHA512:
6
+ metadata.gz: b4baefd9e42b09e98665c73101049726bdd4981e6a9ac108e5d8cdd894a44a7cc7c2e69a33d6f6d5213838d660bed7c69b2b60b102d6121c2d707fd2703aa69c
7
+ data.tar.gz: 8e68451b902d4ad243d2a14a517f0a4f1eb2e8464976446b5ea6dd02d26ef58b3d33afcd76219470a9f91dda28352776d4b637c7e35eced80dc6f0df8af622fc
data/bin/contentar ADDED
@@ -0,0 +1,8 @@
1
#!/usr/bin/env ruby

# CLI entry point: crawls the site at ARGV[0] and saves per-page stats to CSV.
require 'contentar'

url = ARGV[0]
# Original guard was commented out (with a typo: "unlesss"); without it a nil
# URL crashes deep inside Spider. Fail fast with a usage message instead.
raise ArgumentError, 'Usage: contentar URL' if url.nil? || url.empty?

c = Contentar.new(url)
c.save_data
data/lib/contentar.rb ADDED
@@ -0,0 +1,32 @@
1
# Facade: crawls a site with Spider, enriches pages with Crawler,
# and writes the result to a CSV named after the site's domain.
class Contentar

  attr_reader :baseurl, :spider, :crawler, :saver

  # baseurl: the root URL to crawl (e.g. "http://www.example.com/").
  def initialize(baseurl)
    @baseurl = baseurl
    @spider = Spider.new(baseurl)
    @crawler = Crawler.new(spider.get_data)
  end

  # Writes the crawled page stats to data/<domain>.csv via DataSaver.
  def save_data
    DataSaver.csv(csv_filename, crawler.get_data)
  end

  private

  # Derives a filesystem-safe filename from the base URL.
  # Original stripped only "http://", so https URLs left "https:" in the
  # filename; anchor-strip either scheme instead.
  def csv_filename
    baseurl.sub(%r{\Ahttps?://}, '').gsub('www.', '').gsub('/', '')
  end
end
21
+
22
# Gem dependencies used across the library.
require 'spidr'
require 'json'
require 'csv'
require 'rest-client'
require 'dotenv'

# Loads .env so PRICE_ACCESS_KEY is available to DataGetter.
Dotenv.load

# Lazy-load the DataGetter base class so subclasses loaded by the glob
# below can resolve it regardless of file ordering.
# NOTE(review): these paths are cwd-relative — they only resolve when run
# from the project root; when installed as a gem, lib/ is not './lib'.
# Consider File.expand_path relative to __FILE__ — TODO confirm intended usage.
autoload(:DataGetter, './lib/data_getters/data_getter.rb')

# Eagerly require every other library file (getters, processors, etc.).
Dir["./lib/**/*.rb"].each { |file| require file }
data/lib/crawler.rb ADDED
@@ -0,0 +1,34 @@
1
# Enriches spidered page data with per-page stats fetched from the API.
class Crawler
  attr_reader :data

  # data: array of { url:, title: } hashes produced by Spider#get_data.
  def initialize(data)
    @data = data
  end

  # Returns a new array where each page hash is merged with its stats.
  # (Original built the array via inject; map expresses the same thing.)
  def get_data
    data.each_with_index.map do |page_data, index|
      get_page_data(page_data, index)
    end
  end

  private

  # Fetches stats for one page and merges them into its hash.
  def get_page_data(page_data, index)
    title = page_data.fetch(:title) { '' }
    progress_message(index, title)
    page_stats = PageStats.new(page_data.fetch(:url))
    page_data.merge(get_page_stats(page_stats))
  end

  # Console progress line; index is 0-based, displayed 1-based.
  def progress_message(index, title)
    print "Fetching page #{ index + 1 }: \t\t#{ title.to_s.strip }\n"
  end

  # Never rescue Exception (original did): that swallows SignalException,
  # SystemExit, NoMemoryError. StandardError covers API/network failures.
  def get_page_stats(page_stats)
    page_stats.data
  rescue StandardError => e
    { error: e.message }
  end
end
@@ -0,0 +1,17 @@
1
# Configures a DataGetter for the Priceonomics "article" endpoint, which
# extracts the article body from raw page content.
class ArticleDataGetter < DataGetter
  attr_reader :content

  # content: raw page HTML/text fetched by ContentDataGetter.
  def initialize(content)
    @content = content
    @processor = ContentDataProcessor.new
    @api_call = 'article'
    @values = article_values
    super
  end

  private

  # Request payload: synchronous extraction of the supplied content.
  def article_values
    payload = { async: false, data: { content: content, obey_robots: false } }
    payload.to_json
  end
end
@@ -0,0 +1,15 @@
1
# Configures a DataGetter for the Priceonomics "fetch" endpoint, which
# downloads the content of a page at a given URL.
class ContentDataGetter < DataGetter
  # url: the page to fetch content for.
  def initialize(url)
    @url = url
    @processor = ContentDataProcessor.new
    @api_call = 'fetch'
    @values = content_values
    super
  end

  private

  # Request payload: synchronous fetch of the target URL, ignoring robots.txt.
  def content_values
    payload = { async: false, data: { url: url, obey_robots: false } }
    payload.to_json
  end
end
@@ -0,0 +1,46 @@
1
# Base class for Priceonomics API calls. Subclasses set @processor,
# @api_call and @values in their initializers before calling super.
class DataGetter
  # Base URL for all Priceonomics API endpoints.
  API = 'https://api.engine.priceonomics.com/v1/apps/'

  attr_accessor :processor, :url, :api_call, :values

  attr_reader :headers

  # url is accepted for subclass signature compatibility; only the auth
  # headers are set here (PRICE_ACCESS_KEY comes from the environment).
  def initialize(url)
    @headers = { x_access_key: ENV['PRICE_ACCESS_KEY'] }
  end

  # Returns the processed stats hash, or the parsed error payload when the
  # API reported an error.
  def data
    return parsed_response if error?
    processor.data(response_data)
  end

  private

  # Raw JSON response string, fetched once.
  def response_data
    @response_data ||= attempt_get
  end

  # Converts timeout / 500 responses into an error JSON payload so the
  # pipeline keeps going. (Original wrapped this in a redundant begin/end.)
  def attempt_get
    get
  rescue RestClient::RequestTimeout, RestClient::InternalServerError => error
    error_data(error.message)
  end

  def get
    RestClient.post("#{ API }#{ api_call }", values, headers)
  end

  def error_data(error)
    { error: error }.to_json
  end

  # Parse the response once and memoize it — the original parsed the same
  # string twice on the error path (is_error? and error_process).
  def parsed_response
    @parsed_response ||= JSON.parse(response_data)
  end

  # Truthy when the API payload carries an 'error' key.
  def error?
    parsed_response.fetch('error') { false }
  end
end
@@ -0,0 +1,15 @@
1
# Configures a DataGetter for the Priceonomics "readinglevel" endpoint.
class ReadingLevelDataGetter < DataGetter
  # url: the page whose reading level should be scored.
  def initialize(url)
    @url = url
    @processor = ReadingLevelDataProcessor.new
    @api_call = 'readinglevel'
    @values = reading_values
    super
  end

  private

  # FIXME(review): this sends the literal string 'content', not the page's
  # actual content (or its url) — sibling getters pass real data, so this
  # looks like a bug. Confirm what the readinglevel endpoint expects before
  # changing it.
  def reading_values
    { async: false, data: { content: 'content' } }.to_json
  end
end
@@ -0,0 +1,9 @@
1
# Configures a DataGetter for the Priceonomics "social" endpoint, which
# returns share/like/comment counts per network for a URL.
class SocialDataGetter < DataGetter
  # url: the page to gather social stats for.
  def initialize(url)
    @url = url
    @processor = SocialDataProcessor.new
    @api_call = 'social'
    @values = social_values
    super
  end

  private

  # Request payload: synchronous social-stat lookup for the URL.
  def social_values
    { 'async' => false, 'data' => { 'url' => url } }.to_json
  end
end
@@ -0,0 +1,22 @@
1
+ # TODO: Delete this commented-out ArticleDataProcessor once it is confirmed unused.
2
+
3
+ # class ArticleDataProcessor
4
+ # attr_reader :json_data, :parsed_data
5
+
6
+ # def data(json_data)
7
+ # @json_data = json_data
8
+ # @parsed_data = parse_data
9
+ # process
10
+ # end
11
+
12
+ # private
13
+
14
+ # def process
15
+ # article = parsed_data.fetch('content') { '' }
16
+ # { article: article }
17
+ # end
18
+
19
+ # def parse_data
20
+ # JSON.parse(json_data).fetch('data') { {} }
21
+ # end
22
+ # end
@@ -0,0 +1,20 @@
1
# Extracts the page content string from a raw JSON API response.
class ContentDataProcessor
  attr_reader :json_data, :parsed_data

  # Parses the raw response and returns { content: <string> };
  # missing keys degrade to an empty string.
  def data(json_data)
    @json_data = json_data
    @parsed_data = parse_data
    process
  end

  private

  # Builds the result hash, defaulting content to '' when absent.
  def process
    content = parsed_data.key?('content') ? parsed_data['content'] : ''
    { content: content }
  end

  # The 'data' envelope of the response, or an empty hash when missing.
  def parse_data
    envelope = JSON.parse(json_data)
    envelope.key?('data') ? envelope['data'] : {}
  end
end
@@ -0,0 +1,50 @@
1
# Maps the API's reading-level scores into a flat stats hash.
class ReadingLevelDataProcessor
  # Output key => field name in the API's 'data' payload.
  LEVELS = {
    composite_reading_level: 'composite',
    ari_reading_level: 'ari',
    coleman_liau_reading_level: 'coleman-liau',
    flesch_kincaid_reading_level: 'flesch-kincaid',
    gunning_fog_reading_level: 'gunning-fog',
    smog_reading_level: 'smog'
  }.freeze

  attr_reader :json_data, :parsed_data

  # Parses the raw response and returns all six scores; missing scores
  # default to 0.
  def data(json_data)
    @json_data = json_data
    @parsed_data = parse_data
    process
  end

  private

  # One entry per LEVELS mapping, defaulting absent fields to 0.
  def process
    LEVELS.each_with_object({}) do |(key, field), levels|
      levels[key] = parsed_data.fetch(field) { 0 }
    end
  end

  # The 'data' envelope of the response, or an empty hash when missing.
  def parse_data
    JSON.parse(json_data).fetch('data') { {} }
  end
end
@@ -0,0 +1,87 @@
1
# Flattens the API's per-network social stats into a single hash.
class SocialDataProcessor
  # Output key => [network name, field name] in the stats payload.
  METRICS = {
    stumbleupon_views: %w[stumbleupon views],
    reddit_submissions: %w[reddit submission_count],
    reddit_comments: %w[reddit comment_total],
    reddit_score: %w[reddit score_total],
    google_plus_shares: ['google+', 'share_count'],
    pinterest_shares: %w[pinterest share_count],
    twitter_shares: %w[twitter share_count],
    linkedin_shares: %w[linkedin share_count],
    facebook_shares: %w[facebook share_count],
    facebook_likes: %w[facebook like_count],
    facebook_comments: %w[facebook comment_count]
  }.freeze

  attr_reader :json_data, :stats

  # Parses the raw response and returns one count per METRICS entry;
  # absent networks or fields default to 0.
  def data(json_data)
    @json_data = json_data
    @stats = get_stats
    process
  end

  private

  # One entry per METRICS mapping, in declaration order.
  def process
    METRICS.each_with_object({}) do |(key, (network, field)), out|
      out[key] = metric(network, field)
    end
  end

  # The per-network stats hash. Falls back to the 'data' payload itself when
  # it has no 'stats' envelope, and to {} when 'data' is missing entirely.
  def get_stats
    payload = JSON.parse(json_data).fetch('data') { {} }
    payload.fetch('stats') { payload }
  end

  # Count for one network/field pair, defaulting to 0 when absent.
  def metric(network, field)
    stats.fetch(network) { {} }.fetch(field) { 0 }
  end
end
data/lib/data_saver.rb ADDED
@@ -0,0 +1,23 @@
1
# Writes crawl results to CSV files under ./data (relative to cwd).
module DataSaver
  # Saves data — an array of hashes sharing the same keys — to
  # data/<filename>.csv, with the keys as the header row.
  # Returns without writing when data is empty (original crashed with
  # NoMethodError on data[0]).
  def self.csv(filename, data)
    return if data.empty?
    create_data_dir
    file = "#{ Dir.pwd }/data/#{ filename }.csv"
    headers = data[0].keys.map { |k| k.to_s }
    create_csv(file, data, headers)
  end

  # Writes the header row plus one row per hash, stringifying every value.
  def self.create_csv(file, data, headers)
    CSV.open(file, 'w', write_headers: true, headers: headers, encoding: 'UTF-8') do |csv|
      data.each do |d|
        values = d.values.map { |value| value.to_s.force_encoding('UTF-8') }
        csv << values
      end
    end
  end

  # Creates ./data once; no-op if it already exists.
  def self.create_data_dir
    Dir.mkdir('data') unless File.directory?('data')
  end

  # A bare `private` has no effect on singleton (self.) methods — the
  # original left these helpers public. Hide them explicitly.
  private_class_method :create_csv, :create_data_dir
end
data/lib/page_stats.rb ADDED
@@ -0,0 +1,38 @@
1
# Aggregates social, reading-level, article and word-count stats for one URL.
class PageStats

  attr_reader :url

  def initialize(url)
    @url = url
  end

  # Combined stats hash; each getter hits the Priceonomics API once and is
  # memoized below.
  def data
    social_data.
      merge(reading_level_data).
      merge(article_data).
      merge(word_count_data)
  end

  private

  def social_data
    @social_data ||= SocialDataGetter.new(url).data
  end

  def reading_level_data
    @reading_level_data ||= ReadingLevelDataGetter.new(url).data
  end

  def content_data
    @content_data ||= ContentDataGetter.new(url).data
  end

  # Memoized explicitly: the original's `@x ||=` form still re-fetched
  # `content` on every call before hitting the memo.
  def article_data
    return @article_data if @article_data
    content = content_data.fetch(:content) { '' }
    @article_data = ArticleDataGetter.new(content).data
  end

  # Counts whitespace-separated words in the article text. The original
  # used article.length, which counts characters, not words.
  def word_count_data
    article = article_data.fetch(:article) { '' }
    { word_count: article.split.size }
  end
end
data/lib/spider.rb ADDED
@@ -0,0 +1,31 @@
1
# Thin wrapper around Spidr: crawls a site and collects url/title per page.
class Spider
  attr_reader :base_path, :data, :ignored_links

  def initialize(base_path)
    @base_path = base_path
    # Dots escaped: the original /.js/ and /.css/ matched ANY character
    # followed by "js"/"css" (e.g. "/xjs/page"), over-ignoring URLs.
    @ignored_links = [/\.js/, /\.css/]
    @data = []
  end

  # Crawls the whole site and returns the accumulated page data.
  def get_data
    get_site_data
    data
  end

  private

  # Drives Spidr over the site, skipping ignored asset links.
  def get_site_data
    Spidr.site(base_path, ignore_links: ignored_links) do |site|
      get_pages_data(site)
    end
  end

  def get_pages_data(site)
    site.every_page do |page|
      data << get_page_data(page)
    end
  end

  def get_page_data(page)
    { url: page.url.to_s, title: page.title }
  end
end
data/readme.md ADDED
File without changes
metadata ADDED
@@ -0,0 +1,158 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: contentar
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Vlad Mehakovic
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-03-03 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rest-client
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '='
18
+ - !ruby/object:Gem::Version
19
+ version: 1.7.2
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '='
25
+ - !ruby/object:Gem::Version
26
+ version: 1.7.2
27
+ - !ruby/object:Gem::Dependency
28
+ name: json
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '='
32
+ - !ruby/object:Gem::Version
33
+ version: 1.8.2
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '='
39
+ - !ruby/object:Gem::Version
40
+ version: 1.8.2
41
+ - !ruby/object:Gem::Dependency
42
+ name: spidr
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '='
46
+ - !ruby/object:Gem::Version
47
+ version: 0.4.1
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '='
53
+ - !ruby/object:Gem::Version
54
+ version: 0.4.1
55
+ - !ruby/object:Gem::Dependency
56
+ name: dotenv
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '='
60
+ - !ruby/object:Gem::Version
61
+ version: 1.0.2
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '='
67
+ - !ruby/object:Gem::Version
68
+ version: 1.0.2
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '='
74
+ - !ruby/object:Gem::Version
75
+ version: 3.2.0
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '='
81
+ - !ruby/object:Gem::Version
82
+ version: 3.2.0
83
+ - !ruby/object:Gem::Dependency
84
+ name: webmock
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - '='
88
+ - !ruby/object:Gem::Version
89
+ version: 1.20.4
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - '='
95
+ - !ruby/object:Gem::Version
96
+ version: 1.20.4
97
+ - !ruby/object:Gem::Dependency
98
+ name: byebug
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - '='
102
+ - !ruby/object:Gem::Version
103
+ version: 3.5.1
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - '='
109
+ - !ruby/object:Gem::Version
110
+ version: 3.5.1
111
+ description: A Gem to produce competitive intelligence data
112
+ email: vladiim@yahoo.com.au
113
+ executables:
114
+ - contentar
115
+ extensions: []
116
+ extra_rdoc_files: []
117
+ files:
118
+ - bin/contentar
119
+ - lib/contentar.rb
120
+ - lib/crawler.rb
121
+ - lib/data_getters/article.rb
122
+ - lib/data_getters/content.rb
123
+ - lib/data_getters/data_getter.rb
124
+ - lib/data_getters/reading.rb
125
+ - lib/data_getters/social.rb
126
+ - lib/data_processors/article.rb
127
+ - lib/data_processors/content.rb
128
+ - lib/data_processors/reading_level.rb
129
+ - lib/data_processors/social.rb
130
+ - lib/data_saver.rb
131
+ - lib/page_stats.rb
132
+ - lib/spider.rb
133
+ - readme.md
134
+ homepage: https://github.com/vladiim/contentar
135
+ licenses:
136
+ - MIT
137
+ metadata: {}
138
+ post_install_message:
139
+ rdoc_options: []
140
+ require_paths:
141
+ - lib
142
+ required_ruby_version: !ruby/object:Gem::Requirement
143
+ requirements:
144
+ - - ">="
145
+ - !ruby/object:Gem::Version
146
+ version: '0'
147
+ required_rubygems_version: !ruby/object:Gem::Requirement
148
+ requirements:
149
+ - - ">="
150
+ - !ruby/object:Gem::Version
151
+ version: '0'
152
+ requirements: []
153
+ rubyforge_project:
154
+ rubygems_version: 2.2.2
155
+ signing_key:
156
+ specification_version: 4
157
+ summary: Blah
158
+ test_files: []