sitetap 0.0.0 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +55 -10
- data/bin/sitetap +14 -0
- data/lib/sitetap/parser.rb +174 -0
- data/lib/sitetap/scraper.rb +63 -0
- data/lib/sitetap/version.rb +1 -1
- data/sitetap.gemspec +2 -0
- metadata +34 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 910630c447b621e63f65047fb027b385913d0a21
|
4
|
+
data.tar.gz: 6f2689ccdbfd4897d8e9ac8adce44452be392423
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5201c5d44db48541d76ca615a309968b0bd3956e9a50fb1d9eea2ea1f0e8674fbda53bb95f2dd8251a327451348930bdb86592e398d2ea032509a8caef96f267
|
7
|
+
data.tar.gz: 06a8d55f597e7c8289af49469d01b45f279cf6d271c3c29eba4cf38a72df0b791a8208e441ae26832ae6bc689608e2cb786d7f76442733bbedf09c8f0186d484
|
data/README.md
CHANGED
@@ -1,10 +1,16 @@
|
|
1
|
-
|
1
|
+
SiteTap
|
2
|
+
==========
|
2
3
|
|
3
|
-
|
4
|
+
SiteTap takes a home page URL and turns it into a packaged directory of:
|
4
5
|
|
5
|
-
|
6
|
+
* html
|
7
|
+
* plain text
|
8
|
+
* markdown
|
6
9
|
|
7
|
-
|
10
|
+
Installation
|
11
|
+
----------
|
12
|
+
|
13
|
+
To install this to a ruby project, add the following to your `Gemfile`:
|
8
14
|
|
9
15
|
```ruby
|
10
16
|
gem 'sitetap'
|
@@ -12,17 +18,56 @@ gem 'sitetap'
|
|
12
18
|
|
13
19
|
And then execute:
|
14
20
|
|
15
|
-
|
21
|
+
```text
|
22
|
+
$ bundle install
|
23
|
+
```
|
24
|
+
|
25
|
+
Or install it so you can run it globally:
|
26
|
+
|
27
|
+
```text
|
28
|
+
$ gem install sitetap
|
29
|
+
```
|
30
|
+
|
31
|
+
Usage
|
32
|
+
----------
|
33
|
+
|
34
|
+
Using SiteTap is quite simple. You just run the executable and give it a URL.
|
35
|
+
|
36
|
+
```text
|
37
|
+
$ sitetap [URL]
|
38
|
+
```
|
39
|
+
|
40
|
+
So, if I wanted to scrape Sapwood's website, I could do this:
|
41
|
+
|
42
|
+
```text
|
43
|
+
$ sitetap "http://sapwood.org/"
|
44
|
+
```
|
45
|
+
|
46
|
+
Within your current directory, this will create the following directory
|
47
|
+
structure:
|
48
|
+
|
49
|
+
```text
|
50
|
+
- sapwood.org
|
51
|
+
- html
|
52
|
+
- markdown
|
53
|
+
- text
|
54
|
+
- tmp
|
55
|
+
```
|
56
|
+
|
57
|
+
Within each are the converted files from the website.
|
16
58
|
|
17
|
-
|
59
|
+
Bugs
|
60
|
+
----------
|
18
61
|
|
19
|
-
|
62
|
+
Please [create an issue](https://github.com/seancdavis/sitetap/issues/new) if
|
63
|
+
you encounter a bug.
|
20
64
|
|
21
|
-
|
65
|
+
Contributing
|
66
|
+
----------
|
22
67
|
|
23
|
-
|
68
|
+
Missing a feature? Add it!
|
24
69
|
|
25
|
-
|
70
|
+
Found a bug? Fix it!
|
26
71
|
|
27
72
|
1. Fork it ( https://github.com/[my-github-username]/sitetap/fork )
|
28
73
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
data/bin/sitetap
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command-line entry point for SiteTap.
#
# Usage: sitetap [URL]
#
# Scrapes the given URL into a directory below the current working directory
# and then converts every scraped HTML page with the parser.

require 'sitetap/scraper'
require 'sitetap/parser'

url = ARGV[0]

# A missing or blank URL is a usage error: report it on stderr and exit with
# a non-zero status so calling scripts can detect the failure.
abort 'Usage: sitetap [URL]' if url.nil? || url.strip.empty?

scraper = Sitetap::Scraper.scrape!(url)
Sitetap::Parser.parse!(scraper.dir)
|
@@ -0,0 +1,174 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'reverse_markdown'
|
3
|
+
require 'fileutils'
|
4
|
+
|
5
|
+
module Sitetap
  # Converts the HTML files of a scraped site into cleaned-up HTML, markdown
  # and plain text.
  #
  # The parser looks for `*.html` files below `<root>/html` and, for each one,
  # writes three files that mirror the file's path relative to that directory:
  #
  # * `<root>/tmp/<path>`          - the whitespace-normalized HTML
  # * `<root>/markdown/<path>.md`  - markdown built from the selected content
  # * `<root>/text/<path>.txt`     - the selected content with tags removed
  class Parser

    # @param root_dir [String] directory that contains the `html` directory
    def initialize(root_dir)
      @root = root_dir
    end

    # Builds a parser for +root_dir+ and runs it.
    #
    # @param root_dir [String] directory that contains the `html` directory
    # @return [Sitetap::Parser] the parser that did the work
    def self.parse!(root_dir)
      new(root_dir).parse!
    end

    # Creates the output directories and converts every HTML file.
    #
    # @return [Sitetap::Parser] self
    def parse!
      verify_directories
      do_the_loop
      self
    end

    private

    # ------------------------------------ References

    attr_reader :root

    def html_dir
      @html_dir ||= "#{root}/html"
    end

    def tmp_dir
      @tmp_dir ||= "#{root}/tmp"
    end

    def md_dir
      @md_dir ||= "#{root}/markdown"
    end

    def txt_dir
      @txt_dir ||= "#{root}/text"
    end

    # CSS selector for the part of each page that is converted.
    def selector
      @selector ||= "body"
    end

    # Every HTML file below the html directory (looked up once, then cached).
    def files
      @files ||= Dir.glob("#{html_dir}/**/*.html")
    end

    # ------------------------------------ Directories

    # Creates +dir+ including missing parents. FileUtils.mkdir_p does nothing
    # when the directory already exists, so no existence check is needed (the
    # previously used Dir.exists? is gone in Ruby 3.2).
    def mkdir_p(dir)
      FileUtils.mkdir_p(dir)
    end

    def verify_directories
      [tmp_dir, md_dir, txt_dir].each { |dir| mkdir_p(dir) }
    end

    # Makes sure the parent directory of every path in +files+ exists.
    def verify_file_directories(files)
      files.each { |file| mkdir_p(File.dirname(file)) }
    end

    # ------------------------------------ The Loop

    def do_the_loop
      files.each { |file| convert(file) }
    end

    # Runs the whole conversion for a single scraped HTML file.
    def convert(file)
      # path of the file relative to the html directory (scraped dir); a
      # plain string prefix is used so characters in the path are never
      # interpreted as regexp syntax
      file_path = file.sub("#{html_dir}/", '')

      # clean the contents of the html file so we can work with it
      contents = clean_html(file)

      # where the new files will live
      tmp_file_path      = "#{tmp_dir}/#{file_path}"
      markdown_file_path = "#{md_dir}/#{file_path}.md"
      text_file_path     = "#{txt_dir}/#{file_path}.txt"

      # find or create the directories that will contain the files
      verify_file_directories([tmp_file_path, markdown_file_path, text_file_path])

      # write a temporary html file with the cleaned-up contents
      write_file(tmp_file_path, contents)

      # narrow the html down to the selected content
      adj_contents = filter_html(tmp_file_path)

      # convert the adjusted html to markdown and write it to file
      write_file(markdown_file_path, html2markdown(adj_contents))

      # last, remove all the tags and write the plain text file
      write_file(text_file_path, strip_tags(adj_contents))
    end

    # ------------------------------------ Parsing Actions

    # Reads +file+ and returns its contents as one line of valid UTF-8 with
    # every run of whitespace collapsed into a single space and typographic
    # quotes, the en dash and "é" replaced by ASCII characters.
    #
    # NOTE(review): the whitespace collapsing round-trips through
    # Array#to_s, so every backslash and every `["` / `"]` sequence that
    # occurs in the page itself is removed as well - confirm this is
    # acceptable for the pages being scraped.
    #
    # @param file [String] path of the HTML file to read
    # @return [String]
    def clean_html(file)
      File.read(file)
        .encode('UTF-8', :invalid => :replace, :undef => :replace)
        .scrub # encode is a no-op for data already tagged UTF-8; scrub still replaces invalid bytes
        .split(' ')
        .to_s
        .gsub(/\\u0000/, '')
        .split('", "')
        .join(' ')
        .gsub(/\\/, '')
        .gsub(/\"\]/, '')
        .gsub(/\[\"/, '')
        .gsub(/[”“]/, '"')
        .gsub(/[’]/, "'")
        .gsub(/[é]/, 'e')
        .gsub(/[–]/, '-')
    end

    # Parses the HTML file at +file_path+ and returns the markup of the
    # nodes matching #selector as a string.
    #
    # NOTE(review): the file is read as ASCII even though clean_html
    # produces UTF-8 - confirm how Nokogiri treats non-ASCII bytes here.
    def filter_html(file_path)
      contents = File.read(file_path, :encoding => 'ASCII')
      Nokogiri::HTML(contents).css(selector).to_s
    end

    # Replaces every tag, newline and tab with a space, then turns each run
    # of space pairs into a blank line.
    #
    # The pattern carries no encoding flag: the former `/s` flag selects
    # Windows-31J in Ruby (it is not "dotall") and made the match raise on
    # UTF-8 text containing non-ASCII characters.
    #
    # @param html [String]
    # @return [String]
    def strip_tags(html)
      spaced = html.gsub(/(<[^>]*>)|\n|\t/, ' ')
      spaced.gsub(/(\ \ )+/, "\n\n")
    end

    # Converts +html+ to markdown with ReverseMarkdown.
    def html2markdown(html)
      ReverseMarkdown.convert(
        html,
        :unknown_tags => :bypass,
        :github_flavored => true
      )
    end

    # ------------------------------------ Writing Files

    # Writes +content+ to +file_path+, replacing any existing file.
    def write_file(file_path, content)
      File.open(file_path, 'w') { |file| file.write(content) }
    end

  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
|
3
|
+
module Sitetap
  # Downloads a website with `wget` into `<current dir>/<domain>/html`.
  class Scraper

    # @param url [String] URL of the site to scrape; surrounding whitespace
    #   and one trailing slash are dropped
    def initialize(url)
      @url = url.strip.sub(%r{/\z}, '')
    end

    # Builds a scraper for +url+ and runs it.
    #
    # @param url [String] URL of the site to scrape
    # @return [Sitetap::Scraper] the scraper that did the work
    def self.scrape!(url)
      scraper = new(url)
      scraper.scrape!
      scraper
    end

    # Creates the html directory and downloads the site into it.
    #
    # @return [Sitetap::Scraper] self
    def scrape!
      verify_dir
      wget
      self
    end

    # @return [String] root directory of the scrape (parent of `html`)
    def dir
      root
    end

    private

    # Host part of the URL: the scheme and anything from the first `/`, `?`
    # or `#` on are removed, so a URL with a path still yields a value that
    # works for `--domains` and as a single directory name.
    def domain
      @domain ||= @url.sub(%r{\Ahttps?://}i, '')[%r{\A[^/?#]*}]
    end

    def root
      @root ||= "#{Dir.pwd}/#{domain}"
    end

    def html_dir
      "#{root}/html"
    end

    # Creates the html directory. FileUtils.mkdir_p does nothing when the
    # directory already exists, so no existence check is needed (the
    # previously used Dir.exists? is gone in Ruby 3.2).
    def verify_dir
      FileUtils.mkdir_p(html_dir)
    end

    def wget_options
      [
        '--recursive',
        '--page-requisites',
        '--html-extension',
        '--convert-links',
        '--restrict-file-names=windows',
        '--span-hosts'
      ]
    end

    # Runs wget inside the html directory.
    #
    # The command is passed as an argument list, so no shell is involved and
    # the user-supplied URL cannot inject additional commands. `--` ends
    # option parsing so the URL is never read as a wget option.
    #
    # @return [Boolean, nil] result of Kernel#system
    def wget
      system(
        'wget', *wget_options, '--domains', domain, '--', @url,
        :chdir => html_dir
      )
      # add `-o #{log_dir}/scrape.log` to store logfile
    end

  end
end
|
data/lib/sitetap/version.rb
CHANGED
data/sitetap.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sitetap
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sean C Davis
|
@@ -38,10 +38,39 @@ dependencies:
|
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: nokogiri
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: reverse_markdown
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
41
69
|
description: ''
|
42
70
|
email:
|
43
71
|
- scdavis41@gmail.com
|
44
|
-
executables:
|
72
|
+
executables:
|
73
|
+
- sitetap
|
45
74
|
extensions: []
|
46
75
|
extra_rdoc_files: []
|
47
76
|
files:
|
@@ -50,7 +79,10 @@ files:
|
|
50
79
|
- LICENSE.txt
|
51
80
|
- README.md
|
52
81
|
- Rakefile
|
82
|
+
- bin/sitetap
|
53
83
|
- lib/sitetap.rb
|
84
|
+
- lib/sitetap/parser.rb
|
85
|
+
- lib/sitetap/scraper.rb
|
54
86
|
- lib/sitetap/version.rb
|
55
87
|
- sitetap.gemspec
|
56
88
|
homepage: ''
|