sitetap 0.0.0 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 54aa226490e4ea05843234b47534780ce0514413
4
- data.tar.gz: 6a2354d47a8fa7ebc1a4838e8ac3b394f6556abd
3
+ metadata.gz: 910630c447b621e63f65047fb027b385913d0a21
4
+ data.tar.gz: 6f2689ccdbfd4897d8e9ac8adce44452be392423
5
5
  SHA512:
6
- metadata.gz: 01ce3d3194cf12fb9c66d30e3c1a3010bf80e41bd6f68725e6d98cce87bef32277d15a3b2e43c53a0f69c9b56393a4aab85ff9d74a28a6b3e25da89ec693e939
7
- data.tar.gz: 07dd9f5e9e0653f06465bedbccf80242d442aac615c47746fd11103902ba0f206daf38f14257d1f8d5882b92deaab8d3e70b7e6e82c2cc42bdaef1f66cd8f017
6
+ metadata.gz: 5201c5d44db48541d76ca615a309968b0bd3956e9a50fb1d9eea2ea1f0e8674fbda53bb95f2dd8251a327451348930bdb86592e398d2ea032509a8caef96f267
7
+ data.tar.gz: 06a8d55f597e7c8289af49469d01b45f279cf6d271c3c29eba4cf38a72df0b791a8208e441ae26832ae6bc689608e2cb786d7f76442733bbedf09c8f0186d484
data/README.md CHANGED
@@ -1,10 +1,16 @@
1
- # Sitetap
1
+ SiteTap
2
+ ==========
2
3
 
3
- TODO: Write a gem description
4
+ SiteTap takes a home page URL and turns it into a packaged directory of:
4
5
 
5
- ## Installation
6
+ * html
7
+ * plain text
8
+ * markdown
6
9
 
7
- Add this line to your application's Gemfile:
10
+ Installation
11
+ ----------
12
+
13
+ To use this in a Ruby project, add the following to your `Gemfile`:
8
14
 
9
15
  ```ruby
10
16
  gem 'sitetap'
@@ -12,17 +18,56 @@ gem 'sitetap'
12
18
 
13
19
  And then execute:
14
20
 
15
- $ bundle
21
+ ```text
22
+ $ bundle install
23
+ ```
24
+
25
+ Or install it so you can run it globally:
26
+
27
+ ```text
28
+ $ gem install sitetap
29
+ ```
30
+
31
+ Usage
32
+ ----------
33
+
34
+ Using SiteTap is quite simple. You just run the executable and give it a URL.
35
+
36
+ ```text
37
+ $ sitetap [URL]
38
+ ```
39
+
40
+ So, if I wanted to scrape Sapwood's website, I could do this:
41
+
42
+ ```text
43
+ $ sitetap "http://sapwood.org/"
44
+ ```
45
+
46
+ Within your current directory, this will create the following directory
47
+ structure:
48
+
49
+ ```text
50
+ - sapwood.org
51
+ - html
52
+ - markdown
53
+ - text
54
+ - tmp
55
+ ```
56
+
57
+ Within each are the converted files from the website.
16
58
 
17
- Or install it yourself as:
59
+ Bugs
60
+ ----------
18
61
 
19
- $ gem install sitetap
62
+ Please [create an issue](https://github.com/seancdavis/sitetap/issues/new) if
63
+ you encounter a bug.
20
64
 
21
- ## Usage
65
+ Contributing
66
+ ----------
22
67
 
23
- TODO: Write usage instructions here
68
+ Missing a feature? Add it!
24
69
 
25
- ## Contributing
70
+ Found a bug? Fix it!
26
71
 
27
72
  1. Fork it ( https://github.com/[my-github-username]/sitetap/fork )
28
73
  2. Create your feature branch (`git checkout -b my-new-feature`)
data/bin/sitetap ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'sitetap/scraper'
4
+ require 'sitetap/parser'
5
+
6
# Command-line entry point: scrape the site at the URL given as the first
# argument, then convert the scraped pages.
url = ARGV[0].to_s.strip

# Without a URL there is nothing to do. `abort` prints the usage line on
# stderr and exits with a non-zero status so shell callers can detect misuse.
abort 'Usage: sitetap [URL]' if url.empty?

# The scraper reports the directory it downloaded into; the parser then
# converts the pages found there.
scraper = Sitetap::Scraper.scrape!(url)
Sitetap::Parser.parse!(scraper.dir)
@@ -0,0 +1,174 @@
1
+ require 'nokogiri'
2
+ require 'reverse_markdown'
3
+ require 'fileutils'
4
+
5
module Sitetap
  # Converts the HTML pages found under `<root>/html` into three parallel
  # trees below the same root: whitespace-normalised HTML (`tmp`), markdown
  # (`markdown`) and plain text (`text`).
  class Parser

    # Typographic characters that #clean_html rewrites to plain ASCII.
    TYPOGRAPHIC_REPLACEMENTS = {
      "\u201D" => '"', # right double quotation mark
      "\u201C" => '"', # left double quotation mark
      "\u2019" => "'", # right single quotation mark
      "\u00E9" => 'e', # e with acute accent
      "\u2013" => '-'  # en dash
    }.freeze

    # Single pattern matching any key of TYPOGRAPHIC_REPLACEMENTS.
    TYPOGRAPHIC_PATTERN = Regexp.union(TYPOGRAPHIC_REPLACEMENTS.keys)

    private_constant :TYPOGRAPHIC_REPLACEMENTS, :TYPOGRAPHIC_PATTERN

    # @param root_dir [String] directory that contains the scraped `html` tree
    def initialize(root_dir)
      @root = root_dir
    end

    # Builds a parser for +root_dir+ and runs it.
    #
    # @param root_dir [String] directory that contains the scraped `html` tree
    # @return [Sitetap::Parser] the parser that was run
    def self.parse!(root_dir)
      new(root_dir).parse!
    end

    # Creates the output directories and converts every scraped page.
    #
    # @return [Sitetap::Parser] self
    def parse!
      verify_directories
      do_the_loop
      self
    end

    private

      # ------------------------------------ References

      attr_reader :root

      def html_dir
        @html_dir ||= "#{root}/html"
      end

      def tmp_dir
        @tmp_dir ||= "#{root}/tmp"
      end

      def md_dir
        @md_dir ||= "#{root}/markdown"
      end

      def txt_dir
        @txt_dir ||= "#{root}/text"
      end

      # CSS selector whose matches are kept by #filter_html.
      def selector
        @selector ||= "body"
      end

      # Every `.html` file below the html directory, at any depth.
      def files
        @files ||= Dir.glob("#{html_dir}/**/*.html")
      end

      # Path of +file+ relative to the html directory. The prefix is escaped
      # and anchored so that regexp metacharacters in the directory name (for
      # example the dots of a domain) only ever match themselves, and only at
      # the start of the path.
      def relative_path(file)
        file.sub(/\A#{Regexp.escape(html_dir)}\//, '')
      end

      # ------------------------------------ Directories

      # FileUtils.mkdir_p is a no-op for directories that already exist, so
      # no existence check is needed (Dir.exists? is gone in Ruby 3.2+).
      def mkdir_p(dir)
        FileUtils.mkdir_p(dir)
      end

      def verify_directories
        [tmp_dir, md_dir, txt_dir].each { |dir| mkdir_p(dir) }
      end

      # Ensures the parent directory of each given file path exists.
      def verify_file_directories(files)
        files.each { |file| mkdir_p(File.dirname(file)) }
      end

      # ------------------------------------ The Loop

      def do_the_loop
        files.each do |file|

          # get the path of the file relative to the html
          # directory (scraped dir)
          #
          file_path = relative_path(file)

          # clean the contents of the html file so we can work
          # with it
          #
          contents = clean_html(file)

          # set the references to where the new files will
          # live
          #
          tmp_file_path       = "#{tmp_dir}/#{file_path}"
          markdown_file_path  = "#{md_dir}/#{file_path}.md"
          text_file_path      = "#{txt_dir}/#{file_path}.txt"

          # find or create directories that will contain the
          # file
          #
          verify_file_directories([
            tmp_file_path,
            markdown_file_path,
            text_file_path
          ])

          # write a temporary html file with the cleaned-up
          # contents
          #
          write_file(tmp_file_path, contents)

          # now we hone in on the html contents and strip the
          # stuff we don't need
          #
          adj_contents = filter_html(tmp_file_path)

          # convert the adjusted html to markdown and write it
          # to file
          #
          write_file(markdown_file_path, html2markdown(adj_contents))

          # last, we remove all the tags and write the plain
          # text file
          #
          write_file(text_file_path, strip_tags(adj_contents))

        end
      end

      # ------------------------------------ Parsing Actions

      # Reads +file+ as UTF-8 and returns its contents with invalid byte
      # sequences replaced, NUL characters removed, every run of whitespace
      # collapsed to a single space and a few typographic characters
      # rewritten to ASCII.
      #
      # The text is normalised directly rather than via Array#inspect, so
      # backslashes and the sequences `["` / `"]` in the page survive intact.
      def clean_html(file)
        File.read(file, :encoding => 'UTF-8')
          .scrub              # encode('UTF-8') would not repair a UTF-8 string
          .delete("\u0000")
          .split(' ')
          .join(' ')
          .gsub(TYPOGRAPHIC_PATTERN, TYPOGRAPHIC_REPLACEMENTS)
      end

      # Returns the markup of the nodes matching #selector in the given file.
      #
      # NOTE(review): the file is read with an ASCII encoding although
      # #clean_html produces UTF-8 -- confirm whether that is intentional.
      def filter_html(file_path)
        contents = File.read(file_path, :encoding => 'ASCII')
        Nokogiri::HTML(contents).css(selector).to_s
      end

      # Replaces every tag, newline and tab with a space, then turns each run
      # of space pairs into a blank line.
      #
      # The patterns carry no flags: Ruby's `/s` flag selects Windows-31J
      # encoding (it is not "dot matches newline") and makes the match raise
      # on non-ASCII UTF-8 text.
      def strip_tags(html)
        html = html.gsub(/(<[^>]*>)|\n|\t/, ' ')
        html.gsub(/(\ \ )+/, "\n\n")
      end

      def html2markdown(html)
        ReverseMarkdown.convert(
          html,
          :unknown_tags => :bypass,
          :github_flavored => true
        )
      end

      # ------------------------------------ Writing Files

      def write_file(file_path, content)
        File.open(file_path, 'w') { |file| file.write(content) }
      end

  end
end
@@ -0,0 +1,63 @@
1
+ require 'fileutils'
2
+
3
module Sitetap
  # Downloads a website into `<current directory>/<domain>/html` by running
  # the external `wget` program.
  class Scraper

    # @param url [String] URL of the site; surrounding whitespace and one
    #   trailing slash are removed
    def initialize(url)
      @url = url.strip.sub(%r{/\z}, '')
    end

    # Builds a scraper for +url+ and runs it.
    #
    # @param url [String] URL of the site to download
    # @return [Sitetap::Scraper] the scraper that was run
    def self.scrape!(url)
      new(url).scrape!
    end

    # Creates the download directory and runs wget inside it.
    #
    # @return [Sitetap::Scraper] self
    def scrape!
      verify_dir
      wget
      self
    end

    # @return [String] the directory that holds everything for this site
    def dir
      root
    end

    private

      # The URL without its leading `http://` or `https://`.
      def domain
        @domain ||= @url.sub(%r{\Ahttps?://}, '')
      end

      def root
        @root ||= "#{Dir.pwd}/#{domain}"
      end

      def html_dir
        "#{root}/html"
      end

      # FileUtils.mkdir_p is a no-op for directories that already exist, so
      # no existence check is needed (Dir.exists? is gone in Ruby 3.2+).
      def verify_dir
        FileUtils.mkdir_p(html_dir)
      end

      def wget_options
        [
          '--recursive',
          '--page-requisites',
          '--html-extension',
          '--convert-links',
          '--restrict-file-names=windows',
          '--span-hosts'
        ]
      end

      # Runs wget with the html directory as its working directory.
      #
      # The command is passed as an argument list, so no shell is involved
      # and characters in the URL cannot be interpreted as shell syntax.
      #
      # @return [Boolean, nil] the result of Kernel#system; nil means wget
      #   could not be executed at all
      def wget
        command = ['wget', *wget_options, '--domains', domain, @url]
        # add `-o #{log_dir}/scrape.log` to store logfile
        ran = system(*command, :chdir => html_dir)
        # Without a shell nothing reports a missing executable, so say so.
        warn 'sitetap: unable to run wget; is it installed?' if ran.nil?
        ran
      end

  end
end
@@ -1,3 +1,3 @@
1
1
  module Sitetap
2
- VERSION = "0.0.0"
2
+ VERSION = "0.1.0"
3
3
  end
data/sitetap.gemspec CHANGED
@@ -20,4 +20,6 @@ Gem::Specification.new do |spec|
20
20
 
21
21
  spec.add_development_dependency "bundler", "~> 1.7"
22
22
  spec.add_development_dependency "rake", "~> 10.0"
23
+ spec.add_runtime_dependency "nokogiri"
24
+ spec.add_runtime_dependency "reverse_markdown"
23
25
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sitetap
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sean C Davis
@@ -38,10 +38,39 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: reverse_markdown
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
41
69
  description: ''
42
70
  email:
43
71
  - scdavis41@gmail.com
44
- executables: []
72
+ executables:
73
+ - sitetap
45
74
  extensions: []
46
75
  extra_rdoc_files: []
47
76
  files:
@@ -50,7 +79,10 @@ files:
50
79
  - LICENSE.txt
51
80
  - README.md
52
81
  - Rakefile
82
+ - bin/sitetap
53
83
  - lib/sitetap.rb
84
+ - lib/sitetap/parser.rb
85
+ - lib/sitetap/scraper.rb
54
86
  - lib/sitetap/version.rb
55
87
  - sitetap.gemspec
56
88
  homepage: ''