sitetap 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/sitetap +3 -2
- data/lib/sitetap/parser.rb +19 -18
- data/lib/sitetap/version.rb +1 -1
- data/sitetap.gemspec +1 -0
- metadata +17 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 54e85a540d37ca649e921379f6c45b4142713339
|
|
4
|
+
data.tar.gz: 4f1e3f07367b37a1fb21ff14ecb0780f4f4e962f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: f2cc1b00ec4b37fc26facbf3c27c59595c6fdc5de8502424186fc320d985f550c28f986da30fbddce40c6891795ed51cfc56edade094764402de93f2fd743ee8
|
|
7
|
+
data.tar.gz: 0294b1b4d7dc0e458b3a06fac61ef5623d74b395c55a7e2c99209c765113987f193d662712561d47ea4729b76d887e1bc2d10c239299456a1783c477d02c146b
|
data/bin/sitetap
CHANGED
|
@@ -3,12 +3,13 @@
|
|
|
3
3
|
require 'sitetap/scraper'
|
|
4
4
|
require 'sitetap/parser'
|
|
5
5
|
|
|
6
|
-
url
|
|
6
|
+
url = ARGV[0]
|
|
7
|
+
selector = ARGV[1]
|
|
7
8
|
|
|
8
9
|
if url.nil? || url == ''
|
|
9
10
|
puts "Usage: sitetap [URL]"
|
|
10
11
|
exit
|
|
11
12
|
else
|
|
12
13
|
scraper = Sitetap::Scraper.scrape!(url)
|
|
13
|
-
parser = Sitetap::Parser.parse!(scraper.dir)
|
|
14
|
+
parser = Sitetap::Parser.parse!(scraper.dir, selector)
|
|
14
15
|
end
|
data/lib/sitetap/parser.rb
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
require 'nokogiri'
|
|
2
2
|
require 'reverse_markdown'
|
|
3
3
|
require 'fileutils'
|
|
4
|
+
require 'sanitize'
|
|
4
5
|
|
|
5
6
|
module Sitetap
|
|
6
7
|
class Parser
|
|
@@ -9,12 +10,13 @@ module Sitetap
|
|
|
9
10
|
@root = root_dir
|
|
10
11
|
end
|
|
11
12
|
|
|
12
|
-
def self.parse!(root_dir)
|
|
13
|
-
parser = Sitetap::Parser.new(root_dir).parse!
|
|
13
|
+
def self.parse!(root_dir, selector = nil)
|
|
14
|
+
parser = Sitetap::Parser.new(root_dir).parse!(selector)
|
|
14
15
|
parser
|
|
15
16
|
end
|
|
16
17
|
|
|
17
|
-
def parse!
|
|
18
|
+
def parse!(selector = nil)
|
|
19
|
+
@selector = selector unless selector.nil?
|
|
18
20
|
verify_directories
|
|
19
21
|
do_the_loop
|
|
20
22
|
self
|
|
@@ -78,24 +80,24 @@ module Sitetap
|
|
|
78
80
|
|
|
79
81
|
# get the path of the file relative to the html
|
|
80
82
|
# directory (scraped dir)
|
|
81
|
-
#
|
|
83
|
+
#
|
|
82
84
|
file_path = file.gsub(/#{html_dir}\//, '')
|
|
83
85
|
|
|
84
86
|
# clean the contents of the html file so we can work
|
|
85
87
|
# with it
|
|
86
|
-
#
|
|
88
|
+
#
|
|
87
89
|
contents = clean_html(file)
|
|
88
90
|
|
|
89
91
|
# set the references to where the new files will
|
|
90
92
|
# live
|
|
91
|
-
#
|
|
93
|
+
#
|
|
92
94
|
tmp_file_path = "#{tmp_dir}/#{file_path}"
|
|
93
95
|
markdown_file_path = "#{md_dir}/#{file_path}.md"
|
|
94
96
|
text_file_path = "#{txt_dir}/#{file_path}.txt"
|
|
95
97
|
|
|
96
98
|
# find or create directories that will contain the
|
|
97
99
|
# file
|
|
98
|
-
#
|
|
100
|
+
#
|
|
99
101
|
verify_file_directories([
|
|
100
102
|
tmp_file_path,
|
|
101
103
|
markdown_file_path,
|
|
@@ -104,22 +106,22 @@ module Sitetap
|
|
|
104
106
|
|
|
105
107
|
# write a temporary html file with the cleaned-up
|
|
106
108
|
# contents
|
|
107
|
-
#
|
|
109
|
+
#
|
|
108
110
|
write_file(tmp_file_path, contents)
|
|
109
111
|
|
|
110
112
|
# now we hone in on the html contents and strip the
|
|
111
113
|
# stuff we don't need
|
|
112
|
-
#
|
|
114
|
+
#
|
|
113
115
|
adj_contents = filter_html(tmp_file_path)
|
|
114
116
|
|
|
115
117
|
# convert the adjusted html to markdown and write it
|
|
116
118
|
# to file
|
|
117
|
-
#
|
|
119
|
+
#
|
|
118
120
|
write_file(markdown_file_path, html2markdown(adj_contents))
|
|
119
121
|
|
|
120
122
|
# last, we remove all the tags and write the plain
|
|
121
123
|
# text file
|
|
122
|
-
#
|
|
124
|
+
#
|
|
123
125
|
write_file(text_file_path, strip_tags(adj_contents))
|
|
124
126
|
|
|
125
127
|
end
|
|
@@ -145,23 +147,22 @@ module Sitetap
|
|
|
145
147
|
end
|
|
146
148
|
|
|
147
149
|
def filter_html(file_path)
|
|
148
|
-
contents = File.read(file_path, :encoding => '
|
|
150
|
+
contents = File.read(file_path, :encoding => 'UTF-8')
|
|
149
151
|
page = Nokogiri::HTML(contents)
|
|
150
152
|
content = page.css(selector).to_s
|
|
151
|
-
# content = page.css('body').to_s if content == ''
|
|
152
153
|
end
|
|
153
154
|
|
|
154
155
|
def strip_tags(html)
|
|
155
|
-
html =
|
|
156
|
-
html.gsub(
|
|
156
|
+
html = Sanitize.fragment(html)
|
|
157
|
+
html.gsub(/\n(\ )+/, "\n").gsub(/\ \ +/, "\n\n").gsub(/\n\n\n+/, "\n\n")
|
|
157
158
|
end
|
|
158
159
|
|
|
159
160
|
def html2markdown(html)
|
|
160
161
|
ReverseMarkdown.convert(
|
|
161
|
-
html,
|
|
162
|
-
:unknown_tags => :bypass,
|
|
162
|
+
html,
|
|
163
|
+
:unknown_tags => :bypass,
|
|
163
164
|
:github_flavored => true
|
|
164
|
-
)
|
|
165
|
+
).gsub(/\n(\ )+/, "\n").gsub(/\n\n\n+/, "\n\n")
|
|
165
166
|
end
|
|
166
167
|
|
|
167
168
|
# ------------------------------------ Writing Files
|
data/lib/sitetap/version.rb
CHANGED
data/sitetap.gemspec
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: sitetap
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.1
|
|
4
|
+
version: 0.1.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Sean C Davis
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2015-
|
|
11
|
+
date: 2015-11-25 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -66,6 +66,20 @@ dependencies:
|
|
|
66
66
|
- - ">="
|
|
67
67
|
- !ruby/object:Gem::Version
|
|
68
68
|
version: '0'
|
|
69
|
+
- !ruby/object:Gem::Dependency
|
|
70
|
+
name: sanitize
|
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
|
72
|
+
requirements:
|
|
73
|
+
- - ">="
|
|
74
|
+
- !ruby/object:Gem::Version
|
|
75
|
+
version: '0'
|
|
76
|
+
type: :runtime
|
|
77
|
+
prerelease: false
|
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
79
|
+
requirements:
|
|
80
|
+
- - ">="
|
|
81
|
+
- !ruby/object:Gem::Version
|
|
82
|
+
version: '0'
|
|
69
83
|
description: ''
|
|
70
84
|
email:
|
|
71
85
|
- scdavis41@gmail.com
|
|
@@ -105,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
105
119
|
version: '0'
|
|
106
120
|
requirements: []
|
|
107
121
|
rubyforge_project:
|
|
108
|
-
rubygems_version: 2.
|
|
122
|
+
rubygems_version: 2.4.6
|
|
109
123
|
signing_key:
|
|
110
124
|
specification_version: 4
|
|
111
125
|
summary: Scrape content from a website.
|