sitetap 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/sitetap +3 -2
- data/lib/sitetap/parser.rb +19 -18
- data/lib/sitetap/version.rb +1 -1
- data/sitetap.gemspec +1 -0
- metadata +17 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 54e85a540d37ca649e921379f6c45b4142713339
|
4
|
+
data.tar.gz: 4f1e3f07367b37a1fb21ff14ecb0780f4f4e962f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f2cc1b00ec4b37fc26facbf3c27c59595c6fdc5de8502424186fc320d985f550c28f986da30fbddce40c6891795ed51cfc56edade094764402de93f2fd743ee8
|
7
|
+
data.tar.gz: 0294b1b4d7dc0e458b3a06fac61ef5623d74b395c55a7e2c99209c765113987f193d662712561d47ea4729b76d887e1bc2d10c239299456a1783c477d02c146b
|
data/bin/sitetap
CHANGED
@@ -3,12 +3,13 @@
|
|
3
3
|
require 'sitetap/scraper'
|
4
4
|
require 'sitetap/parser'
|
5
5
|
|
6
|
-
url
|
6
|
+
url = ARGV[0]
|
7
|
+
selector = ARGV[1]
|
7
8
|
|
8
9
|
if url.nil? || url == ''
|
9
10
|
puts "Usage: sitetap [URL]"
|
10
11
|
exit
|
11
12
|
else
|
12
13
|
scraper = Sitetap::Scraper.scrape!(url)
|
13
|
-
parser = Sitetap::Parser.parse!(scraper.dir)
|
14
|
+
parser = Sitetap::Parser.parse!(scraper.dir, selector)
|
14
15
|
end
|
data/lib/sitetap/parser.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
require 'reverse_markdown'
|
3
3
|
require 'fileutils'
|
4
|
+
require 'sanitize'
|
4
5
|
|
5
6
|
module Sitetap
|
6
7
|
class Parser
|
@@ -9,12 +10,13 @@ module Sitetap
|
|
9
10
|
@root = root_dir
|
10
11
|
end
|
11
12
|
|
12
|
-
def self.parse!(root_dir)
|
13
|
-
parser = Sitetap::Parser.new(root_dir).parse!
|
13
|
+
def self.parse!(root_dir, selector = nil)
|
14
|
+
parser = Sitetap::Parser.new(root_dir).parse!(selector)
|
14
15
|
parser
|
15
16
|
end
|
16
17
|
|
17
|
-
def parse!
|
18
|
+
def parse!(selector = nil)
|
19
|
+
@selector = selector unless selector.nil?
|
18
20
|
verify_directories
|
19
21
|
do_the_loop
|
20
22
|
self
|
@@ -78,24 +80,24 @@ module Sitetap
|
|
78
80
|
|
79
81
|
# get the path of the file relative to the html
|
80
82
|
# directory (scraped dir)
|
81
|
-
#
|
83
|
+
#
|
82
84
|
file_path = file.gsub(/#{html_dir}\//, '')
|
83
85
|
|
84
86
|
# clean the contents of the html file so we can work
|
85
87
|
# with it
|
86
|
-
#
|
88
|
+
#
|
87
89
|
contents = clean_html(file)
|
88
90
|
|
89
91
|
# set the references to where the new files will
|
90
92
|
# live
|
91
|
-
#
|
93
|
+
#
|
92
94
|
tmp_file_path = "#{tmp_dir}/#{file_path}"
|
93
95
|
markdown_file_path = "#{md_dir}/#{file_path}.md"
|
94
96
|
text_file_path = "#{txt_dir}/#{file_path}.txt"
|
95
97
|
|
96
98
|
# find or create directories that will contain the
|
97
99
|
# file
|
98
|
-
#
|
100
|
+
#
|
99
101
|
verify_file_directories([
|
100
102
|
tmp_file_path,
|
101
103
|
markdown_file_path,
|
@@ -104,22 +106,22 @@ module Sitetap
|
|
104
106
|
|
105
107
|
# write a temporary html file with the cleaned-up
|
106
108
|
# contents
|
107
|
-
#
|
109
|
+
#
|
108
110
|
write_file(tmp_file_path, contents)
|
109
111
|
|
110
112
|
# now we hone in on the html contents and strip the
|
111
113
|
# stuff we don't need
|
112
|
-
#
|
114
|
+
#
|
113
115
|
adj_contents = filter_html(tmp_file_path)
|
114
116
|
|
115
117
|
# convert the adjusted html to markdown and write it
|
116
118
|
# to file
|
117
|
-
#
|
119
|
+
#
|
118
120
|
write_file(markdown_file_path, html2markdown(adj_contents))
|
119
121
|
|
120
122
|
# last, we remove all the tags and write the plain
|
121
123
|
# text file
|
122
|
-
#
|
124
|
+
#
|
123
125
|
write_file(text_file_path, strip_tags(adj_contents))
|
124
126
|
|
125
127
|
end
|
@@ -145,23 +147,22 @@ module Sitetap
|
|
145
147
|
end
|
146
148
|
|
147
149
|
def filter_html(file_path)
|
148
|
-
contents = File.read(file_path, :encoding => '
|
150
|
+
contents = File.read(file_path, :encoding => 'UTF-8')
|
149
151
|
page = Nokogiri::HTML(contents)
|
150
152
|
content = page.css(selector).to_s
|
151
|
-
# content = page.css('body').to_s if content == ''
|
152
153
|
end
|
153
154
|
|
154
155
|
def strip_tags(html)
|
155
|
-
html =
|
156
|
-
html.gsub(
|
156
|
+
html = Sanitize.fragment(html)
|
157
|
+
html.gsub(/\n(\ )+/, "\n").gsub(/\ \ +/, "\n\n").gsub(/\n\n\n+/, "\n\n")
|
157
158
|
end
|
158
159
|
|
159
160
|
def html2markdown(html)
|
160
161
|
ReverseMarkdown.convert(
|
161
|
-
html,
|
162
|
-
:unknown_tags => :bypass,
|
162
|
+
html,
|
163
|
+
:unknown_tags => :bypass,
|
163
164
|
:github_flavored => true
|
164
|
-
)
|
165
|
+
).gsub(/\n(\ )+/, "\n").gsub(/\n\n\n+/, "\n\n")
|
165
166
|
end
|
166
167
|
|
167
168
|
# ------------------------------------ Writing Files
|
data/lib/sitetap/version.rb
CHANGED
data/sitetap.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sitetap
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sean C Davis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-11-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: sanitize
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
69
83
|
description: ''
|
70
84
|
email:
|
71
85
|
- scdavis41@gmail.com
|
@@ -105,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
105
119
|
version: '0'
|
106
120
|
requirements: []
|
107
121
|
rubyforge_project:
|
108
|
-
rubygems_version: 2.
|
122
|
+
rubygems_version: 2.4.6
|
109
123
|
signing_key:
|
110
124
|
specification_version: 4
|
111
125
|
summary: Scrape content from a website.
|