site_analyzer 0.3.14 → 0.3.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 358a561c7c98802ee66fc102a2b63987e7c973df
4
- data.tar.gz: f5863c85a3e4656490e9bb1a5a76321cc6dfc1f6
3
+ metadata.gz: 3fea66e1d0fa5a9b34e70289b4e475ff53a2afa2
4
+ data.tar.gz: 66f0019cbee985a9b5568c5f7dee4c9a1aabd984
5
5
  SHA512:
6
- metadata.gz: 424a067b83a5f78cd7080e9126cb31be20495dd86fac0c4a1e6eb7543eb11bc8138303a00766253c261ba7d97ec085b579ab6ec0dcb07e8508f7a676c189b4d8
7
- data.tar.gz: 0a8667179dc3f554b209f75c55764671afe646dd9008f2201f9c5ef644891fc3c571b824ea9f2fda2d1191d0f4e2197d42981080c3d067edd8f519852b69b257
6
+ metadata.gz: 04e6f0c4e25ffc460bd92fa40a2154129a503247c185eb87b7d24c02e5ff0400712983ddc296945f3f3ba0b907a526270417c9595658b1ea0f1a90ad4aad7dda
7
+ data.tar.gz: 403386d6cafffcbbb1034c133337ecfc27dea00c482a61b11f5f6aefb6bcacf18a6edcdcedbceb82411e1afaf8d1eeeeeed05f2adc9faf09e2c0b31783d205e3
data/README.md CHANGED
@@ -26,7 +26,7 @@ require 'site_analyzer'<br>
26
26
 
27
27
  SiteAnalyzer::Report.create site: 'http://savchuk.space', pages: 10, robot: false, console: true<br>
28
28
  Return hash with report.<br>
29
- arguments: site - url mast start from http or https, pages - number of pages to scan, robot - use or not robot.txt file, console - output to console
29
+ arguments: site - url must start from http or https, pages - number of pages to scan, robot - use or not robot.txt file, console - output to console
30
30
 
31
31
  <br>
32
32
  <b>Author</b>
data/SiteAnalyzer.iml CHANGED
@@ -6,15 +6,22 @@
6
6
  <orderEntry type="inheritedJdk" />
7
7
  <orderEntry type="sourceFolder" forTests="false" />
8
8
  <orderEntry type="library" scope="PROVIDED" name="addressable (v2.3.8, rbenv: 2.2.2) [gem]" level="application" />
9
- <orderEntry type="library" scope="PROVIDED" name="bundler (v1.10.5, rbenv: 2.2.2) [gem]" level="application" />
9
+ <orderEntry type="library" scope="PROVIDED" name="bundler (v1.10.6, rbenv: 2.2.2) [gem]" level="application" />
10
+ <orderEntry type="library" scope="PROVIDED" name="coderay (v1.1.0, rbenv: 2.2.2) [gem]" level="application" />
10
11
  <orderEntry type="library" scope="PROVIDED" name="diff-lcs (v1.2.5, rbenv: 2.2.2) [gem]" level="application" />
11
12
  <orderEntry type="library" scope="PROVIDED" name="ffi (v1.9.10, rbenv: 2.2.2) [gem]" level="application" />
12
13
  <orderEntry type="library" scope="PROVIDED" name="formatador (v0.2.5, rbenv: 2.2.2) [gem]" level="application" />
14
+ <orderEntry type="library" scope="PROVIDED" name="guard (v2.13.0, rbenv: 2.2.2) [gem]" level="application" />
13
15
  <orderEntry type="library" scope="PROVIDED" name="guard-compat (v1.2.1, rbenv: 2.2.2) [gem]" level="application" />
16
+ <orderEntry type="library" scope="PROVIDED" name="listen (v3.0.3, rbenv: 2.2.2) [gem]" level="application" />
17
+ <orderEntry type="library" scope="PROVIDED" name="lumberjack (v1.0.9, rbenv: 2.2.2) [gem]" level="application" />
14
18
  <orderEntry type="library" scope="PROVIDED" name="mini_portile (v0.6.2, rbenv: 2.2.2) [gem]" level="application" />
15
19
  <orderEntry type="library" scope="PROVIDED" name="nenv (v0.2.0, rbenv: 2.2.2) [gem]" level="application" />
16
20
  <orderEntry type="library" scope="PROVIDED" name="nokogiri (v1.6.6.2, rbenv: 2.2.2) [gem]" level="application" />
21
+ <orderEntry type="library" scope="PROVIDED" name="notiffany (v0.0.7, rbenv: 2.2.2) [gem]" level="application" />
22
+ <orderEntry type="library" scope="PROVIDED" name="pry (v0.10.1, rbenv: 2.2.2) [gem]" level="application" />
17
23
  <orderEntry type="library" scope="PROVIDED" name="rake (v10.4.2, rbenv: 2.2.2) [gem]" level="application" />
24
+ <orderEntry type="library" scope="PROVIDED" name="rb-fsevent (v0.9.5, rbenv: 2.2.2) [gem]" level="application" />
18
25
  <orderEntry type="library" scope="PROVIDED" name="rb-inotify (v0.9.5, rbenv: 2.2.2) [gem]" level="application" />
19
26
  <orderEntry type="library" scope="PROVIDED" name="robotstxt (v0.5.4, rbenv: 2.2.2) [gem]" level="application" />
20
27
  <orderEntry type="library" scope="PROVIDED" name="rspec (v3.3.0, rbenv: 2.2.2) [gem]" level="application" />
@@ -1,30 +1,76 @@
1
1
  module SiteAnalyzer
2
- # Get site page and provide metods for analyse
2
+ # Get site page and provide data for future analysis
3
3
  require 'nokogiri'
4
4
  require 'addressable/uri'
5
5
  require 'timeout'
6
6
  require 'stringex_lite'
7
7
  require 'open-uri'
8
8
  class Page
9
- attr_reader :page_url, :titles, :page, :page_path, :site_domain
9
+ attr_reader :page_url, :page_path, :site_domain,
10
+ :all_titles, :title_good, :title_and_h1_good,
11
+ :meta_description_good, :meta_keywords, :code_less,
12
+ :meta_data, :meta_title_duplicates, :title_h1_h2,
13
+ :have_h2, :page_text_size, :page_a_tags,
14
+ :meta_desc_content, :h2_text, :hlu
15
+ # create page object, fill data and clear unneeded elements
10
16
  def initialize(url)
11
17
  @page_url = url
12
- @page = []
13
- @site_domain = ''
14
- @page_path = ''
15
- @titles = []
16
18
  get_page(url)
17
- fill_data_field!
19
+ fill_data_field! if @page
20
+ clear!
18
21
  end
19
-
20
- def fill_data_field!
21
- @titles = all_titles
22
- end
23
-
22
+ # to_s for report
24
23
  def to_s
25
24
  "Page url: #{@page_url} Site url: #{@site_domain}"
26
25
  end
26
+ # get all home (on this site) urls on the page
27
+ def home_a
28
+ if @page_a_tags
29
+ home_a = []
30
+ @page_a_tags.uniq.each do |link|
31
+ uri = URI(link[0].to_ascii) rescue nil #TODO: write additional logic for link to image
32
+ if uri && @site_domain
33
+ home_a << link[0] if uri.host == @site_domain
34
+ end
35
+ end
36
+ home_a
37
+ end
38
+ end
39
+ # get all remote links on the page
40
+ def remote_a
41
+ if @page_a_tags
42
+ remote_a = []
43
+ @page_a_tags.uniq.each do |link|
44
+ uri = URI(link[0].to_ascii)
45
+ if uri && @site_domain
46
+ remote_a << link[0] unless uri.host == @site_domain
47
+ end
48
+ end
49
+ remote_a
50
+ end
51
+ end
52
+
53
+ private
27
54
 
55
+ # fill Page instance with data for report
56
+ def fill_data_field!
57
+ @all_titles = titles
58
+ @meta_data = collect_metadates
59
+ @title_h1_h2 = all_titles_h1_h2
60
+ @page_text_size = text_size
61
+ @page_a_tags = all_a_tags
62
+ @meta_desc_content = all_meta_description_content
63
+ @h2_text = h2
64
+ @hlu = bad_url
65
+ @title_good = title_good?
66
+ @title_and_h1_good = title_and_h1_good?
67
+ @meta_description_good = metadescription_good?
68
+ @meta_keywords = keywords_good?
69
+ @code_less = code_less?
70
+ @meta_title_duplicates = metadates_good?
71
+ @have_h2 = h2?
72
+ end
73
+ # get page with open-uri, then parse it with Nokogiri. Get site domain and path from URI
28
74
  def get_page(url)
29
75
  begin
30
76
  timeout(30) do
@@ -37,11 +83,11 @@ module SiteAnalyzer
37
83
  return nil
38
84
  end
39
85
  end
40
-
86
+ # check that there is one title and it is less than 70 characters
41
87
  def title_good?
42
88
  @page.css('title').size == 1 && @page.css('title').text.size < 70 if @page
43
89
  end
44
- # true if title and h1 have no dublicates
90
+ # true if title and h1 have no duplicates
45
91
  def title_and_h1_good?
46
92
  if @page
47
93
  arr = []
@@ -75,7 +121,7 @@ module SiteAnalyzer
75
121
  true
76
122
  end
77
123
  end
78
- # true if code less then text
124
+ # true if the page's code is less than the text on it
79
125
  def code_less?
80
126
  if @page
81
127
  sum = 0
@@ -86,19 +132,20 @@ module SiteAnalyzer
86
132
  sum < page_text / 2
87
133
  end
88
134
  end
89
-
135
+ # collect meta tags for future report
90
136
  def collect_metadates
91
- @page.css('meta') if @page
137
+ meta = []
138
+ meta = @page.css('meta') if @page
139
+ meta
92
140
  end
93
-
141
+ # check meta and title tags for duplicates
94
142
  def metadates_good?
95
143
  if @page
96
- meta_tags = collect_metadates
97
- return false if @page.css('title').size > 1 || meta_tags.nil?
144
+ return false if @all_titles.size > 1 || @meta_data.empty?
98
145
  node_names = []
99
- meta_tags.each { |node| node_names << node['name'] }
100
- return false if node_names.compact!.size < 1
101
- node_names.uniq.size == node_names.size
146
+ @meta_data.each { |node| node_names << node['name'] }
147
+ node_names.compact!
148
+ node_names.uniq.size == node_names.size unless node_names.nil? || node_names.size < 1
102
149
  end
103
150
  end
104
151
  # return hash with all titles, h1 and h2
@@ -107,53 +154,18 @@ module SiteAnalyzer
107
154
  out = []
108
155
  out << @page.css('title').text << { @page_url => @page.css('h1').text }
109
156
  out << { @page_url => @page.css('h2').text }
157
+ out
110
158
  end
111
159
  end
112
-
113
- def home_a
114
- if @page
115
- home_a = []
116
- all_a_tags_href.uniq.each do |link|
117
- uri = URI(link.to_ascii) rescue nil #TODO: write additional logic for link to image
118
- if uri && @site_domain
119
- home_a << link if uri.host == @site_domain
120
- end
121
- end
122
- home_a
123
- end
124
- end
125
-
126
- def remote_a
127
- if @page
128
- remote_a = []
129
- all_a_tags_href.uniq.each do |link|
130
- uri = URI(link.to_ascii)
131
- if uri && @site_domain
132
- remote_a << link unless uri.host == @site_domain
133
- end
134
- end
135
- remote_a
136
- end
137
- end
138
-
139
- def all_a_tags_href
140
- if @page
141
- tags = []
142
- @page.css('a').each do |node|
143
- tags << node['href']
144
- end
145
- tags.compact
146
- end
147
- end
148
-
160
+ # check if page has h2 tags
149
161
  def h2?
150
162
  @page.css('h2').size > 0 if @page
151
163
  end
152
-
153
- def page_text_size
164
+ # return page text size in characters
165
+ def text_size
154
166
  @page.text.size if @page
155
167
  end
156
-
168
+ # get all a tags
157
169
  def all_a_tags
158
170
  if @page
159
171
  tags = []
@@ -163,15 +175,15 @@ module SiteAnalyzer
163
175
  tags.compact
164
176
  end
165
177
  end
166
-
167
- def all_titles
178
+ # return all page titles
179
+ def titles
168
180
  if @page
169
181
  titles = []
170
182
  @page.css('title').each { |tag| titles << tag.text }
171
183
  titles
172
184
  end
173
185
  end
174
-
186
+ # return all meta description content
175
187
  def all_meta_description_content
176
188
  if @page
177
189
  tags = []
@@ -181,7 +193,7 @@ module SiteAnalyzer
181
193
  tags
182
194
  end
183
195
  end
184
-
196
+ # return all h2 tags text
185
197
  def h2
186
198
  if @page
187
199
  h2s = []
@@ -189,9 +201,13 @@ module SiteAnalyzer
189
201
  h2s
190
202
  end
191
203
  end
192
-
204
+ # check that the page url is a human-readable url (HLU)
193
205
  def bad_url
194
206
  @page_url if @page_path.size > 1 unless @page_path =~ /^[\w.\-\/]+$/i
195
207
  end
208
+ # remove unneeded information from the page
209
+ def clear!
210
+ @page = nil
211
+ end
196
212
  end
197
213
  end
@@ -11,7 +11,7 @@ module SiteAnalyzer
11
11
  @use_robot = use_robot
12
12
  @site = Site.new(@site_domain, @max_pages, @use_robot)
13
13
  end
14
-
14
+ # Entry point for gem. Creates and shows the report. Returns an array; prints to console if selected
15
15
  def self.create(options)
16
16
  options[:robot] = false if options[:robot] == 'false'
17
17
  options[:console] = false if options[:console] == 'false'
@@ -40,7 +40,7 @@ module SiteAnalyzer
40
40
 
41
41
  def to_s
42
42
  return 'Report is empty' if @report.nil? || @report.empty?
43
- header = Terminal::Table.new title: "Report for #{@site_domain} with #{@max_pages} pages max_pages and robot check is #{@use_robot}"
43
+ header = Terminal::Table.new title: "Report for #{@site_domain} with #{@max_pages} pages and robot.txt check is #{@use_robot}"
44
44
  puts header
45
45
  @report.each_pair do |key, value|
46
46
  rows = []
@@ -57,7 +57,7 @@ module SiteAnalyzer
57
57
  def check_titles_text_less_than_70
58
58
  result = []
59
59
  @site.pages.each do |page|
60
- result << page.page_url unless page.title_good?
60
+ result << page.page_url unless page.title_good
61
61
  end
62
62
  result
63
63
  end
@@ -65,7 +65,7 @@ module SiteAnalyzer
65
65
  def check_title_and_h1_for_doubles
66
66
  result = []
67
67
  @site.pages.each do |page|
68
- result << page.page_url unless page.title_and_h1_good?
68
+ result << page.page_url unless page.title_and_h1_good
69
69
  end
70
70
  result
71
71
  end
@@ -73,7 +73,7 @@ module SiteAnalyzer
73
73
  def check_meta_description
74
74
  result = []
75
75
  @site.pages.each do |page|
76
- result << page.page_url unless page.metadescription_good?
76
+ result << page.page_url unless page.meta_description_good
77
77
  end
78
78
  result
79
79
  end
@@ -81,7 +81,7 @@ module SiteAnalyzer
81
81
  def check_meta_keywords_tags
82
82
  result = []
83
83
  @site.pages.each do |page|
84
- result << page.page_url unless page.keywords_good?
84
+ result << page.page_url unless page.meta_keywords
85
85
  end
86
86
  result
87
87
  end
@@ -89,7 +89,7 @@ module SiteAnalyzer
89
89
  def check_h2
90
90
  result = []
91
91
  @site.pages.each do |page|
92
- result << page.page_url unless page.h2?
92
+ result << page.page_url unless page.have_h2
93
93
  end
94
94
  result
95
95
  end
@@ -105,7 +105,7 @@ module SiteAnalyzer
105
105
  def code_more
106
106
  result = []
107
107
  @site.pages.each do |page|
108
- result << page.page_url unless page.code_less?
108
+ result << page.page_url unless page.code_less
109
109
  end
110
110
  result
111
111
  end
@@ -16,7 +16,7 @@ module SiteAnalyzer
16
16
  @pages << Page.new(convert_to_valid(@main_url))
17
17
  scan_site!
18
18
  end
19
-
19
+ # check if the page is blocked by robots.txt
20
20
  def robot_txt_allowed?(url)
21
21
  if @use_robot_txt
22
22
  Robotstxt.allowed?(url, '*') rescue nil
@@ -24,7 +24,7 @@ module SiteAnalyzer
24
24
  true
25
25
  end
26
26
  end
27
-
27
+ # scan pages: add pages to scan, scan while allowed, collect new pages to scan from each, and optimize the array of links
28
28
  def scan_site!
29
29
  add_pages_for_scan!
30
30
  while @pages_for_scan.size > 0
@@ -38,20 +38,20 @@ module SiteAnalyzer
38
38
  end
39
39
  end
40
40
  end
41
-
41
+ # fill the pages-for-scan array; also add bad pages to the bad_pages array
42
42
  def add_pages_for_scan!
43
43
  @pages_for_scan = []
44
44
  @bad_pages = []
45
45
  @pages.each do |page|
46
- @bad_pages << page.page_url unless page.page
47
- if page.page
46
+ @bad_pages << page.page_url unless page.page_a_tags
47
+ if page.page_a_tags
48
48
  page.home_a.each do |link|
49
49
  @pages_for_scan << link
50
50
  end
51
51
  end
52
52
  end
53
53
  end
54
-
54
+ # create Page and add it to the site
55
55
  def add_page(url)
56
56
  unless robot_txt_allowed?(url)
57
57
  @scanned_pages << url
@@ -61,42 +61,42 @@ module SiteAnalyzer
61
61
  @pages << page
62
62
  @scanned_pages << url
63
63
  end
64
-
64
+ # get all titles on site and return array of them
65
65
  def all_titles
66
66
  result = []
67
67
  @pages.each do |page|
68
- if page.page
69
- result << [page.page_url, page.titles]
68
+ if page.page_a_tags
69
+ result << [page.page_url, page.all_titles]
70
70
  end
71
71
  end
72
72
  result
73
73
  end
74
-
74
+ # get all meta description tags content and return it as array
75
75
  def all_descriptions
76
76
  result = []
77
77
  @pages.each do |page|
78
- if page.page
79
- result << [page.page_url, page.all_meta_description_content]
78
+ if page.page_a_tags
79
+ result << [page.page_url, page.meta_desc_content]
80
80
  end
81
81
  end
82
82
  result
83
83
  end
84
-
84
+ # get all h2 tags and return array of it
85
85
  def all_h2
86
86
  result = []
87
87
  @pages.each do |page|
88
- unless page.page
89
- result << [page.page_url, page.h2]
88
+ unless page.page_a_tags
89
+ result << [page.page_url, page.h2_text]
90
90
  end
91
91
  end
92
92
  result
93
93
  end
94
-
94
+ # get all a tags and return array of it
95
95
  def all_a
96
96
  result = []
97
97
  @pages.each do |page|
98
- if page.page
99
- page.all_a_tags.compact.each do |tag|
98
+ if page.page_a_tags
99
+ page.page_a_tags.compact.each do |tag|
100
100
  tag[0] = '-' unless tag[0]
101
101
  tag[1] = '-' unless tag[1]
102
102
  tag[2] = '-' unless tag[2]
@@ -106,29 +106,21 @@ module SiteAnalyzer
106
106
  end
107
107
  result.compact
108
108
  end
109
-
110
- def pages_url
111
- result = []
112
- @pages.each do |page|
113
- result << page.page_url if page.page
114
- end
115
- result
116
- end
117
-
109
+ # get all non HLU url and return array
118
110
  def bad_urls
119
111
  result = []
120
112
  @pages.each do |page|
121
- result << page.bad_url
113
+ result << page.hlu
122
114
  end
123
115
  result.compact!
124
116
  end
125
-
117
+ # get new array pages for scan and compact it
126
118
  def optimize_scan!
127
119
  @pages_for_scan = @pages_for_scan.compact.uniq
128
120
  @scanned_pages = @scanned_pages.compact.uniq
129
121
  @pages_for_scan = @pages_for_scan - @scanned_pages
130
122
  end
131
-
123
+ # check url and try to convert it to valid, remove .jpg links, add scheme to url
132
124
  def convert_to_valid(url)
133
125
  return nil if url =~ /.jpg$/i
134
126
  url.insert(0, @main_url.first(5)) if url.start_with? '//'
@@ -1,3 +1,3 @@
1
1
  module SiteAnalyzer
2
- VERSION = '0.3.14'
2
+ VERSION = '0.3.15'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: site_analyzer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.14
4
+ version: 0.3.15
5
5
  platform: ruby
6
6
  authors:
7
7
  - Denis Savchuk