sitetap 0.0.0 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 54aa226490e4ea05843234b47534780ce0514413
4
- data.tar.gz: 6a2354d47a8fa7ebc1a4838e8ac3b394f6556abd
3
+ metadata.gz: 910630c447b621e63f65047fb027b385913d0a21
4
+ data.tar.gz: 6f2689ccdbfd4897d8e9ac8adce44452be392423
5
5
  SHA512:
6
- metadata.gz: 01ce3d3194cf12fb9c66d30e3c1a3010bf80e41bd6f68725e6d98cce87bef32277d15a3b2e43c53a0f69c9b56393a4aab85ff9d74a28a6b3e25da89ec693e939
7
- data.tar.gz: 07dd9f5e9e0653f06465bedbccf80242d442aac615c47746fd11103902ba0f206daf38f14257d1f8d5882b92deaab8d3e70b7e6e82c2cc42bdaef1f66cd8f017
6
+ metadata.gz: 5201c5d44db48541d76ca615a309968b0bd3956e9a50fb1d9eea2ea1f0e8674fbda53bb95f2dd8251a327451348930bdb86592e398d2ea032509a8caef96f267
7
+ data.tar.gz: 06a8d55f597e7c8289af49469d01b45f279cf6d271c3c29eba4cf38a72df0b791a8208e441ae26832ae6bc689608e2cb786d7f76442733bbedf09c8f0186d484
data/README.md CHANGED
@@ -1,10 +1,16 @@
1
- # Sitetap
1
+ SiteTap
2
+ ==========
2
3
 
3
- TODO: Write a gem description
4
+ SiteTap takes a home page URL and turns it into a packaged directory of:
4
5
 
5
- ## Installation
6
+ * html
7
+ * plain text
8
+ * markdown
6
9
 
7
- Add this line to your application's Gemfile:
10
+ Installation
11
+ ----------
12
+
13
+ To use this in a Ruby project, add the following to your `Gemfile`:
8
14
 
9
15
  ```ruby
10
16
  gem 'sitetap'
@@ -12,17 +18,56 @@ gem 'sitetap'
12
18
 
13
19
  And then execute:
14
20
 
15
- $ bundle
21
+ ```text
22
+ $ bundle install
23
+ ```
24
+
25
+ Or install it so you can run it globally:
26
+
27
+ ```text
28
+ $ gem install sitetap
29
+ ```
30
+
31
+ Usage
32
+ ----------
33
+
34
+ Using SiteTap is quite simple. You just run the executable and give it a URL.
35
+
36
+ ```text
37
+ $ sitetap [URL]
38
+ ```
39
+
40
+ So, if I wanted to scrape Sapwood's website, I could do this:
41
+
42
+ ```text
43
+ $ sitetap "http://sapwood.org/"
44
+ ```
45
+
46
+ Within your current directory, this will create the following directory
47
+ structure:
48
+
49
+ ```text
50
+ - sapwood.org
51
+ - html
52
+ - markdown
53
+ - text
54
+ - tmp
55
+ ```
56
+
57
+ Within each are the converted files from the website.
16
58
 
17
- Or install it yourself as:
59
+ Bugs
60
+ ----------
18
61
 
19
- $ gem install sitetap
62
+ Please [create an issue](https://github.com/seancdavis/sitetap/issues/new) if
63
+ you encounter a bug.
20
64
 
21
- ## Usage
65
+ Contributing
66
+ ----------
22
67
 
23
- TODO: Write usage instructions here
68
+ Missing a feature? Add it!
24
69
 
25
- ## Contributing
70
+ Found a bug? Fix it!
26
71
 
27
72
  1. Fork it ( https://github.com/[my-github-username]/sitetap/fork )
28
73
  2. Create your feature branch (`git checkout -b my-new-feature`)
data/bin/sitetap ADDED
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby

# Command-line entry point for SiteTap.
#
# Usage: sitetap [URL]
#
# Scrapes the site at URL with Sitetap::Scraper, then hands the directory
# the scraper reports to Sitetap::Parser for conversion.

require 'sitetap/scraper'
require 'sitetap/parser'

# Treat a missing argument and a whitespace-only argument the same way:
# the scraper strips the URL itself, so a blank string would otherwise
# reach wget as an empty URL.
url = ARGV[0].to_s.strip

if url.empty?
  puts "Usage: sitetap [URL]"
  exit
end

scraper = Sitetap::Scraper.scrape!(url)
Sitetap::Parser.parse!(scraper.dir)
@@ -0,0 +1,174 @@
1
+ require 'nokogiri'
2
+ require 'reverse_markdown'
3
+ require 'fileutils'
4
+
5
module Sitetap
  # Converts the HTML files found under `<root>/html` into three parallel
  # trees below the same root:
  #
  #   <root>/tmp       whitespace-normalised copies of the HTML
  #   <root>/markdown  markdown conversions (`<name>.html.md`)
  #   <root>/text      tag-stripped plain text (`<name>.html.txt`)
  #
  class Parser

    # root_dir - path of the directory that contains the `html` directory.
    def initialize(root_dir)
      @root = root_dir
    end

    # Builds a parser for root_dir, runs it, and returns the parser.
    def self.parse!(root_dir)
      new(root_dir).parse!
    end

    # Creates the output directories, converts every HTML file, and
    # returns self.
    def parse!
      verify_directories
      do_the_loop
      self
    end

    private

      # ------------------------------------ References

      def root
        @root
      end

      def html_dir
        @html_dir ||= "#{root}/html"
      end

      def tmp_dir
        @tmp_dir ||= "#{root}/tmp"
      end

      def md_dir
        @md_dir ||= "#{root}/markdown"
      end

      def txt_dir
        @txt_dir ||= "#{root}/text"
      end

      # CSS selector used to pick the part of each page that is converted.
      def selector
        @selector ||= "body"
      end

      # Every `.html` file below html_dir, at any depth.
      def files
        @files ||= Dir.glob("#{html_dir}/**/*.html")
      end

      # Path of `file` relative to html_dir.
      #
      # The prefix is escaped and anchored so that regexp metacharacters
      # in the root path (a domain name contains dots) are matched
      # literally, and only the leading occurrence is removed.
      def relative_path(file)
        file.sub(/\A#{Regexp.escape(html_dir)}\//, '')
      end

      # ------------------------------------ Directories

      # FileUtils.mkdir_p is already a no-op for existing directories, so
      # no existence check is needed (Dir.exists? is gone in Ruby 3.2+).
      def mkdir_p(dir)
        FileUtils.mkdir_p(dir)
      end

      def verify_directories
        [tmp_dir, md_dir, txt_dir].each { |dir| mkdir_p(dir) }
      end

      # Makes sure the parent directory of every given file path exists.
      def verify_file_directories(files)
        files.each { |file| mkdir_p(File.dirname(file)) }
      end

      # ------------------------------------ The Loop

      def do_the_loop
        files.each do |file|
          file_path = relative_path(file)

          # normalised html we can work with
          contents = clean_html(file)

          # where the generated files will live
          tmp_file_path      = "#{tmp_dir}/#{file_path}"
          markdown_file_path = "#{md_dir}/#{file_path}.md"
          text_file_path     = "#{txt_dir}/#{file_path}.txt"

          verify_file_directories([
            tmp_file_path,
            markdown_file_path,
            text_file_path
          ])

          # the cleaned html is written to disk first because
          # filter_html reads it back from that file
          write_file(tmp_file_path, contents)
          adj_contents = filter_html(tmp_file_path)

          write_file(markdown_file_path, html2markdown(adj_contents))
          write_file(text_file_path, strip_tags(adj_contents))
        end
      end

      # ------------------------------------ Parsing Actions

      # Reads `file` and returns its contents with:
      #
      #   * invalid/undefined bytes replaced while encoding to UTF-8
      #   * leading/trailing whitespace dropped and every whitespace run
      #     collapsed to one space
      #   * NUL characters and backslashes removed
      #   * curly quotes, "é" and the en dash replaced by ASCII stand-ins
      #
      # The whitespace collapse is done with split/join directly. Going
      # through Array#to_s (as an earlier version did) turned an empty
      # file into "[]" and deleted every `["` / `"]` found in the page.
      def clean_html(file)
        File.read(file)
          .encode('UTF-8', :invalid => :replace, :undef => :replace)
          .split(' ')
          .join(' ')
          .delete("\u0000")
          .gsub(/\\/, '')
          .gsub(/[\u201D\u201C]/, '"') # curly double quotes
          .gsub(/\u2019/, "'")         # curly apostrophe
          .gsub(/\u00E9/, 'e')         # e with acute accent
          .gsub(/\u2013/, '-')         # en dash
      end

      # Parses the HTML stored at file_path and returns the markup of the
      # nodes matching `selector` as a String.
      #
      # NOTE(review): the file is read with the 'ASCII' encoding even
      # though clean_html produces UTF-8 -- confirm this is intentional.
      def filter_html(file_path)
        contents = File.read(file_path, :encoding => 'ASCII')
        Nokogiri::HTML(contents).css(selector).to_s
      end

      # Replaces every tag, newline and tab with a space, then turns each
      # run of doubled spaces into a blank line.
      #
      # No regexp flag is used on purpose: in Ruby `/s` selects the
      # Windows-31J encoding (it is not "dot matches newline") and makes
      # the match raise on non-ASCII UTF-8 input. `[^>]` already matches
      # newlines, so multi-line tags are still handled.
      def strip_tags(html)
        spaced = html.gsub(/(<[^>]*>)|\n|\t/, ' ')
        spaced.gsub(/(\ \ )+/, "\n\n")
      end

      def html2markdown(html)
        ReverseMarkdown.convert(
          html,
          :unknown_tags => :bypass,
          :github_flavored => true
        )
      end

      # ------------------------------------ Writing Files

      def write_file(file_path, content)
        File.open(file_path, 'w') { |file| file.write(content) }
      end

  end
end
@@ -0,0 +1,63 @@
1
+ require 'fileutils'
2
+
3
module Sitetap
  # Mirrors a website into `<working dir>/<host>/html` by running wget.
  class Scraper

    # url - home page URL, e.g. "http://sapwood.org/". Surrounding
    #       whitespace and one trailing slash are dropped; nil is treated
    #       as an empty string.
    def initialize(url)
      @url = url.to_s.strip.sub(%r{/\z}, '')
    end

    # Builds a scraper for url, runs it, and returns the scraper.
    def self.scrape!(url)
      new(url).scrape!
    end

    # Creates the html directory, runs wget inside it, and returns self.
    def scrape!
      verify_dir
      wget
      self
    end

    # Directory that holds everything produced for this site.
    def dir
      root
    end

    private

      # Host part of the URL: the scheme is removed and anything from the
      # first "/", "?" or "#" onwards is cut off, so that a URL with a
      # path still yields a value usable for wget's `--domains` and as a
      # single directory name.
      def domain
        @domain ||= @url.sub(%r{\Ahttps?://}i, '').split(%r{[/?#]}, 2).first.to_s
      end

      def root
        @root ||= "#{Dir.pwd}/#{domain}"
      end

      def html_dir
        "#{root}/html"
      end

      # FileUtils.mkdir_p does nothing when the directory already exists,
      # so no existence check is needed (Dir.exists? is gone in Ruby 3.2+).
      def verify_dir
        FileUtils.mkdir_p(html_dir)
      end

      def wget_options
        [
          '--recursive',
          '--page-requisites',
          '--html-extension',
          '--convert-links',
          '--restrict-file-names=windows',
          '--span-hosts'
        ]
      end

      # Runs wget with html_dir as its working directory.
      #
      # The command is passed to `system` as an argument list, so the URL
      # is never interpreted by a shell. `system` returns nil only when
      # the command could not be executed at all; a false result is not
      # reported because wget may exit non-zero for reasons that still
      # leave downloaded pages behind.
      def wget
        ran = system(
          'wget', *wget_options, '--domains', domain, @url,
          :chdir => html_dir
        )
        warn "sitetap: unable to run wget (is it installed?)" if ran.nil?
        ran
        # add `-o #{log_dir}/scrape.log` to store logfile
      end

  end
end
@@ -1,3 +1,3 @@
1
1
  module Sitetap
2
- VERSION = "0.0.0"
2
+ VERSION = "0.1.0"
3
3
  end
data/sitetap.gemspec CHANGED
@@ -20,4 +20,6 @@ Gem::Specification.new do |spec|
20
20
 
21
21
  spec.add_development_dependency "bundler", "~> 1.7"
22
22
  spec.add_development_dependency "rake", "~> 10.0"
23
+ spec.add_runtime_dependency "nokogiri"
24
+ spec.add_runtime_dependency "reverse_markdown"
23
25
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sitetap
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sean C Davis
@@ -38,10 +38,39 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: reverse_markdown
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
41
69
  description: ''
42
70
  email:
43
71
  - scdavis41@gmail.com
44
- executables: []
72
+ executables:
73
+ - sitetap
45
74
  extensions: []
46
75
  extra_rdoc_files: []
47
76
  files:
@@ -50,7 +79,10 @@ files:
50
79
  - LICENSE.txt
51
80
  - README.md
52
81
  - Rakefile
82
+ - bin/sitetap
53
83
  - lib/sitetap.rb
84
+ - lib/sitetap/parser.rb
85
+ - lib/sitetap/scraper.rb
54
86
  - lib/sitetap/version.rb
55
87
  - sitetap.gemspec
56
88
  homepage: ''