sitetap 0.0.0 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +55 -10
- data/bin/sitetap +14 -0
- data/lib/sitetap/parser.rb +174 -0
- data/lib/sitetap/scraper.rb +63 -0
- data/lib/sitetap/version.rb +1 -1
- data/sitetap.gemspec +2 -0
- metadata +34 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 910630c447b621e63f65047fb027b385913d0a21
|
4
|
+
data.tar.gz: 6f2689ccdbfd4897d8e9ac8adce44452be392423
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5201c5d44db48541d76ca615a309968b0bd3956e9a50fb1d9eea2ea1f0e8674fbda53bb95f2dd8251a327451348930bdb86592e398d2ea032509a8caef96f267
|
7
|
+
data.tar.gz: 06a8d55f597e7c8289af49469d01b45f279cf6d271c3c29eba4cf38a72df0b791a8208e441ae26832ae6bc689608e2cb786d7f76442733bbedf09c8f0186d484
|
data/README.md
CHANGED
@@ -1,10 +1,16 @@
|
|
1
|
-
|
1
|
+
SiteTap
|
2
|
+
==========
|
2
3
|
|
3
|
-
|
4
|
+
SiteTap takes a home page URL and turns it into a packaged directory of:
|
4
5
|
|
5
|
-
|
6
|
+
* html
|
7
|
+
* plain text
|
8
|
+
* markdown
|
6
9
|
|
7
|
-
|
10
|
+
Installation
|
11
|
+
----------
|
12
|
+
|
13
|
+
To use this in a Ruby project, add the following to your `Gemfile`:
|
8
14
|
|
9
15
|
```ruby
|
10
16
|
gem 'sitetap'
|
@@ -12,17 +18,56 @@ gem 'sitetap'
|
|
12
18
|
|
13
19
|
And then execute:
|
14
20
|
|
15
|
-
|
21
|
+
```text
|
22
|
+
$ bundle install
|
23
|
+
```
|
24
|
+
|
25
|
+
Or install it so you can run it globally:
|
26
|
+
|
27
|
+
```text
|
28
|
+
$ gem install sitetap
|
29
|
+
```
|
30
|
+
|
31
|
+
Usage
|
32
|
+
----------
|
33
|
+
|
34
|
+
Using SiteTap is quite simple. You just run the executable and give it a URL.
|
35
|
+
|
36
|
+
```text
|
37
|
+
$ sitetap [URL]
|
38
|
+
```
|
39
|
+
|
40
|
+
So, if I wanted to scrape Sapwood's website, I could do this:
|
41
|
+
|
42
|
+
```text
|
43
|
+
$ sitetap "http://sapwood.org/"
|
44
|
+
```
|
45
|
+
|
46
|
+
Within your current directory, this will create the following directory
|
47
|
+
structure:
|
48
|
+
|
49
|
+
```text
|
50
|
+
- sapwood.org
|
51
|
+
- html
|
52
|
+
- markdown
|
53
|
+
- text
|
54
|
+
- tmp
|
55
|
+
```
|
56
|
+
|
57
|
+
Within each are the converted files from the website.
|
16
58
|
|
17
|
-
|
59
|
+
Bugs
|
60
|
+
----------
|
18
61
|
|
19
|
-
|
62
|
+
Please [create an issue](https://github.com/seancdavis/sitetap/issues/new) if
|
63
|
+
you encounter a bug.
|
20
64
|
|
21
|
-
|
65
|
+
Contributing
|
66
|
+
----------
|
22
67
|
|
23
|
-
|
68
|
+
Missing a feature? Add it!
|
24
69
|
|
25
|
-
|
70
|
+
Found a bug? Fix it!
|
26
71
|
|
27
72
|
1. Fork it ( https://github.com/[my-github-username]/sitetap/fork )
|
28
73
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
data/bin/sitetap
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby

require 'sitetap/scraper'
require 'sitetap/parser'

# Command-line entry point.
#
# Usage: sitetap [URL]
#
# Scrapes the site at URL, then parses the directory the scraper reports
# (`scraper.dir`) into its converted copies.

# Normalise the argument up front so that a missing, empty or
# whitespace-only URL all take the usage path instead of reaching the scraper.
url = ARGV[0].to_s.strip

if url.empty?
  puts "Usage: sitetap [URL]"
  exit
else
  scraper = Sitetap::Scraper.scrape!(url)
  Sitetap::Parser.parse!(scraper.dir)
end
|
@@ -0,0 +1,174 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'reverse_markdown'
|
3
|
+
require 'fileutils'
|
4
|
+
|
5
|
+
module Sitetap
  # Converts the HTML files of a scraped site into cleaned HTML, Markdown and
  # plain-text copies.
  #
  # Every `*.html` file found beneath `<root>/html` is written out three
  # times, mirroring its path relative to that directory:
  #
  #   <root>/tmp/<path>           whitespace-normalised HTML
  #   <root>/markdown/<path>.md   Markdown conversion
  #   <root>/text/<path>.txt      text with the tags removed
  class Parser

    # @param root_dir [String] directory that contains the `html` folder
    def initialize(root_dir)
      @root = root_dir
    end

    # Builds a parser for +root_dir+ and runs it.
    #
    # @param root_dir [String] directory that contains the `html` folder
    # @return [Sitetap::Parser] the parser that was run
    def self.parse!(root_dir)
      new(root_dir).parse!
    end

    # Creates the output directories and converts every HTML file.
    #
    # @return [Sitetap::Parser] self
    def parse!
      verify_directories
      do_the_loop
      self
    end

    private

      # ------------------------------------ References

      # Root directory handed to the constructor.
      def root
        @root
      end

      # Directory the source HTML files are read from.
      def html_dir
        @html_dir ||= "#{root}/html"
      end

      # Directory the cleaned-up HTML files are written to.
      def tmp_dir
        @tmp_dir ||= "#{root}/tmp"
      end

      # Directory the Markdown files are written to.
      def md_dir
        @md_dir ||= "#{root}/markdown"
      end

      # Directory the plain-text files are written to.
      def txt_dir
        @txt_dir ||= "#{root}/text"
      end

      # CSS selector for the part of each page that is kept.
      def selector
        @selector ||= "body"
      end

      # All `.html` files below the html directory, at any depth.
      def files
        @files ||= Dir.glob("#{html_dir}/**/*.html")
      end

      # ------------------------------------ Directories

      # Creates +dir+ and any missing parents.
      #
      # FileUtils.mkdir_p is a no-op for directories that already exist, so
      # no existence check is needed (the former `Dir.exists?` guard no longer
      # works on Ruby 3.2+, where that method was removed).
      def mkdir_p(dir)
        FileUtils.mkdir_p(dir)
      end

      # Makes sure the three output directories exist.
      def verify_directories
        [tmp_dir, md_dir, txt_dir].each { |dir| mkdir_p(dir) }
      end

      # Makes sure the parent directory of each given file path exists.
      #
      # @param files [Array<String>] file paths about to be written
      def verify_file_directories(files)
        files.each { |file| mkdir_p(File.dirname(file)) }
      end

      # ------------------------------------ The Loop

      # Converts each HTML file into its tmp, markdown and text counterparts.
      def do_the_loop
        # The html directory is matched literally and only at the start of
        # the path, so characters such as "." or "(" in the directory name
        # are not treated as regular expression syntax.
        html_prefix = /\A#{Regexp.escape(html_dir)}\//

        files.each do |file|

          # get the path of the file relative to the html
          # directory (scraped dir)
          #
          file_path = file.sub(html_prefix, '')

          # clean the contents of the html file so we can work
          # with it
          #
          contents = clean_html(file)

          # set the references to where the new files will
          # live
          #
          tmp_file_path       = "#{tmp_dir}/#{file_path}"
          markdown_file_path  = "#{md_dir}/#{file_path}.md"
          text_file_path      = "#{txt_dir}/#{file_path}.txt"

          # find or create directories that will contain the
          # file
          #
          verify_file_directories([
            tmp_file_path,
            markdown_file_path,
            text_file_path
          ])

          # write a temporary html file with the cleaned-up
          # contents
          #
          write_file(tmp_file_path, contents)

          # now we hone in on the html contents and strip the
          # stuff we don't need
          #
          adj_contents = filter_html(tmp_file_path)

          # convert the adjusted html to markdown and write it
          # to file
          #
          write_file(markdown_file_path, html2markdown(adj_contents))

          # last, we remove all the tags and write the plain
          # text file
          #
          write_file(text_file_path, strip_tags(adj_contents))

        end
      end

      # ------------------------------------ Parsing Actions

      # Reads +file+ and returns its contents as a single line: runs of
      # whitespace become one space and a few typographic characters are
      # replaced with ASCII equivalents.
      #
      # The steps depend on each other's exact output, so their order matters.
      #
      # NOTE(review): the backslash, `"]` and `["` removals apply to the whole
      # text, so such sequences that were part of the page itself are removed
      # as well -- confirm that this is acceptable.
      #
      # @param file [String] path of the HTML file to read
      # @return [String] the cleaned contents
      def clean_html(file)
        File.read(file)
          .encode('UTF-8', :invalid => :replace, :undef => :replace)
          .split(' ')               # break into whitespace-separated words
          .to_s                     # the array's inspect form: ["a", "b"]
          .gsub(/\\u0000/, '')      # drop literal \u0000 escapes from that form
          .split('", "')            # undo the inspect separators...
          .join(' ')                # ...leaving single spaces between words
          .gsub(/\\/, '')           # remove every backslash (inspect escapes)
          .gsub(/\"\]/, '')         # remove the inspect form's closing "]
          .gsub(/\[\"/, '')         # remove the inspect form's opening ["
          .gsub(/[”“]/, '"')
          .gsub(/[’]/, "'")
          .gsub(/[é]/, 'e')
          .gsub(/[–]/, '-')
      end

      # Parses the HTML file at +file_path+ and returns the markup of the
      # nodes matching #selector, as a string.
      #
      # @param file_path [String] path of the cleaned HTML file
      # @return [String]
      def filter_html(file_path)
        contents = File.read(file_path, :encoding => 'ASCII')
        page = Nokogiri::HTML(contents)
        page.css(selector).to_s
        # content = page.css('body').to_s if content == ''
      end

      # Replaces every tag, newline and tab with a space, then turns each run
      # of double spaces into a blank line.
      #
      # The pattern deliberately has no `s` flag: in Ruby that flag selects
      # the Windows-31J encoding and makes matching raise on non-ASCII UTF-8
      # text. `[^>]` already matches newlines, so no flag is needed.
      #
      # @param html [String]
      # @return [String]
      def strip_tags(html)
        html = html.gsub(/(<[^>]*>)|\n|\t/, ' ')
        html.gsub(/(\ \ )+/, "\n\n")
      end

      # Converts +html+ to GitHub-flavoured Markdown with ReverseMarkdown,
      # using its `:bypass` handling for unknown tags.
      #
      # @param html [String]
      # @return [String]
      def html2markdown(html)
        ReverseMarkdown.convert(
          html,
          :unknown_tags => :bypass,
          :github_flavored => true
        )
      end

      # ------------------------------------ Writing Files

      # Writes +content+ to +file_path+, replacing any existing file.
      def write_file(file_path, content)
        File.open(file_path, 'w') { |file| file.write(content) }
      end

  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
|
3
|
+
module Sitetap
  # Downloads a website with `wget` into "<current directory>/<domain>/html".
  class Scraper

    # @param url [String] URL of the site to scrape; surrounding whitespace
    #   and one trailing slash are removed
    def initialize(url)
      @url = url.strip.sub(/\/\z/, '')
    end

    # Builds a scraper for +url+ and runs it.
    #
    # @param url [String] URL of the site to scrape
    # @return [Sitetap::Scraper] the scraper that was run
    def self.scrape!(url)
      scraper = new(url)
      scraper.scrape!
      scraper
    end

    # Creates the download directory and runs wget in it.
    #
    # @return [Sitetap::Scraper] self
    def scrape!
      verify_dir
      wget
      self
    end

    # @return [String] the root directory for this site
    def dir
      root
    end

    private

      # Host part of the URL: the scheme is removed from the start only and
      # anything from the first "/", "?" or "#" onwards is dropped, so that a
      # URL with a path or query still yields a value usable for wget's
      # `--domains` option and as a directory name.
      def domain
        @domain ||= @url.sub(/\Ahttps?:\/\//i, '').split(/[\/?#]/).first.to_s
      end

      # Directory named after the domain, inside the current directory.
      def root
        @root ||= "#{Dir.pwd}/#{domain}"
      end

      # Directory the downloaded files are stored in.
      def html_dir
        "#{root}/html"
      end

      # Makes sure the download directory exists.
      #
      # FileUtils.mkdir_p is a no-op for existing directories, so no
      # existence check is needed (`Dir.exists?` was removed in Ruby 3.2).
      def verify_dir
        FileUtils.mkdir_p(html_dir)
      end

      # Options passed to every wget run.
      def wget_options
        [
          '--recursive',
          '--page-requisites',
          '--html-extension',
          '--convert-links',
          '--restrict-file-names=windows',
          '--span-hosts'
        ]
      end

      # Runs wget inside the download directory.
      #
      # The arguments are passed as a list, so no shell is involved and
      # characters such as "&", ";" or "?" in the URL reach wget unchanged
      # instead of being interpreted by the shell.
      #
      # @return [true, false, nil] result of Kernel#system: true on a zero
      #   exit status, false otherwise, nil if wget could not be executed
      def wget
        result = system(
          'wget', *wget_options, '--domains', domain, @url,
          :chdir => html_dir
        )
        # add `-o #{log_dir}/scrape.log` to store logfile
        warn 'sitetap: unable to run wget (is it installed?)' if result.nil?
        result
      end

  end
end
|
data/lib/sitetap/version.rb
CHANGED
data/sitetap.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sitetap
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sean C Davis
|
@@ -38,10 +38,39 @@ dependencies:
|
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: nokogiri
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: reverse_markdown
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
41
69
|
description: ''
|
42
70
|
email:
|
43
71
|
- scdavis41@gmail.com
|
44
|
-
executables:
|
72
|
+
executables:
|
73
|
+
- sitetap
|
45
74
|
extensions: []
|
46
75
|
extra_rdoc_files: []
|
47
76
|
files:
|
@@ -50,7 +79,10 @@ files:
|
|
50
79
|
- LICENSE.txt
|
51
80
|
- README.md
|
52
81
|
- Rakefile
|
82
|
+
- bin/sitetap
|
53
83
|
- lib/sitetap.rb
|
84
|
+
- lib/sitetap/parser.rb
|
85
|
+
- lib/sitetap/scraper.rb
|
54
86
|
- lib/sitetap/version.rb
|
55
87
|
- sitetap.gemspec
|
56
88
|
homepage: ''
|