sitetap 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 89d0090637f8b89550f043a3fbfc6d7094c4bfa9
4
- data.tar.gz: f58e37144bb456e114d2c44957a11969ed47366e
3
+ metadata.gz: 54e85a540d37ca649e921379f6c45b4142713339
4
+ data.tar.gz: 4f1e3f07367b37a1fb21ff14ecb0780f4f4e962f
5
5
  SHA512:
6
- metadata.gz: f0b5a22231d7239bdff707c26345654d5a60ce97c9fc63922abfb954f531532a8b58edb3a7451f6b661167a404a40efde8f53a59f40f61640a341a1b701a5384
7
- data.tar.gz: 65843f3c0c823ee2ffbeff2f95d70b7f781899565961d7c0b00c5a0b03504cfd3162ad11674ceb188c64415c2049eb948341d757448b113ec04de91621f04700
6
+ metadata.gz: f2cc1b00ec4b37fc26facbf3c27c59595c6fdc5de8502424186fc320d985f550c28f986da30fbddce40c6891795ed51cfc56edade094764402de93f2fd743ee8
7
+ data.tar.gz: 0294b1b4d7dc0e458b3a06fac61ef5623d74b395c55a7e2c99209c765113987f193d662712561d47ea4729b76d887e1bc2d10c239299456a1783c477d02c146b
data/bin/sitetap CHANGED
@@ -3,12 +3,13 @@
3
3
  require 'sitetap/scraper'
4
4
  require 'sitetap/parser'
5
5
 
6
- url = ARGV[0]
6
+ url = ARGV[0]
7
+ selector = ARGV[1]
7
8
 
8
9
  if url.nil? || url == ''
9
10
  puts "Usage: sitetap [URL]"
10
11
  exit
11
12
  else
12
13
  scraper = Sitetap::Scraper.scrape!(url)
13
- parser = Sitetap::Parser.parse!(scraper.dir)
14
+ parser = Sitetap::Parser.parse!(scraper.dir, selector)
14
15
  end
@@ -1,6 +1,7 @@
1
1
  require 'nokogiri'
2
2
  require 'reverse_markdown'
3
3
  require 'fileutils'
4
+ require 'sanitize'
4
5
 
5
6
  module Sitetap
6
7
  class Parser
@@ -9,12 +10,13 @@ module Sitetap
9
10
  @root = root_dir
10
11
  end
11
12
 
12
- def self.parse!(root_dir)
13
- parser = Sitetap::Parser.new(root_dir).parse!
13
+ def self.parse!(root_dir, selector = nil)
14
+ parser = Sitetap::Parser.new(root_dir).parse!(selector)
14
15
  parser
15
16
  end
16
17
 
17
- def parse!
18
+ def parse!(selector = nil)
19
+ @selector = selector unless selector.nil?
18
20
  verify_directories
19
21
  do_the_loop
20
22
  self
@@ -78,24 +80,24 @@ module Sitetap
78
80
 
79
81
  # get the path of the file relative to the html
80
82
  # directory (scraped dir)
81
- #
83
+ #
82
84
  file_path = file.gsub(/#{html_dir}\//, '')
83
85
 
84
86
  # clean the contents of the html file so we can work
85
87
  # with it
86
- #
88
+ #
87
89
  contents = clean_html(file)
88
90
 
89
91
  # set the references to where the new files will
90
92
  # live
91
- #
93
+ #
92
94
  tmp_file_path = "#{tmp_dir}/#{file_path}"
93
95
  markdown_file_path = "#{md_dir}/#{file_path}.md"
94
96
  text_file_path = "#{txt_dir}/#{file_path}.txt"
95
97
 
96
98
  # find or create directories that will contain the
97
99
  # file
98
- #
100
+ #
99
101
  verify_file_directories([
100
102
  tmp_file_path,
101
103
  markdown_file_path,
@@ -104,22 +106,22 @@ module Sitetap
104
106
 
105
107
  # write a temporary html file with the cleaned-up
106
108
  # contents
107
- #
109
+ #
108
110
  write_file(tmp_file_path, contents)
109
111
 
110
112
  # now we hone in on the html contents and strip the
111
113
  # stuff we don't need
112
- #
114
+ #
113
115
  adj_contents = filter_html(tmp_file_path)
114
116
 
115
117
  # convert the adjusted html to markdown and write it
116
118
  # to file
117
- #
119
+ #
118
120
  write_file(markdown_file_path, html2markdown(adj_contents))
119
121
 
120
122
  # last, we remove all the tags and write the plain
121
123
  # text file
122
- #
124
+ #
123
125
  write_file(text_file_path, strip_tags(adj_contents))
124
126
 
125
127
  end
@@ -145,23 +147,22 @@ module Sitetap
145
147
  end
146
148
 
147
149
  def filter_html(file_path)
148
- contents = File.read(file_path, :encoding => 'ASCII')
150
+ contents = File.read(file_path, :encoding => 'UTF-8')
149
151
  page = Nokogiri::HTML(contents)
150
152
  content = page.css(selector).to_s
151
- # content = page.css('body').to_s if content == ''
152
153
  end
153
154
 
154
155
  def strip_tags(html)
155
- html = html.gsub(/(<[^>]*>)|\n|\t/s, ' ')
156
- html.gsub(/(\ \ )+/, "\n\n")
156
+ html = Sanitize.fragment(html)
157
+ html.gsub(/\n(\ )+/, "\n").gsub(/\ \ +/, "\n\n").gsub(/\n\n\n+/, "\n\n")
157
158
  end
158
159
 
159
160
  def html2markdown(html)
160
161
  ReverseMarkdown.convert(
161
- html,
162
- :unknown_tags => :bypass,
162
+ html,
163
+ :unknown_tags => :bypass,
163
164
  :github_flavored => true
164
- )
165
+ ).gsub(/\n(\ )+/, "\n").gsub(/\n\n\n+/, "\n\n")
165
166
  end
166
167
 
167
168
  # ------------------------------------ Writing Files
@@ -1,3 +1,3 @@
1
1
  module Sitetap
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.2"
3
3
  end
data/sitetap.gemspec CHANGED
@@ -22,4 +22,5 @@ Gem::Specification.new do |spec|
22
22
  spec.add_development_dependency "rake", "~> 10.0"
23
23
  spec.add_runtime_dependency "nokogiri"
24
24
  spec.add_runtime_dependency "reverse_markdown"
25
+ spec.add_runtime_dependency "sanitize"
25
26
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sitetap
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sean C Davis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-17 00:00:00.000000000 Z
11
+ date: 2015-11-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: sanitize
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  description: ''
70
84
  email:
71
85
  - scdavis41@gmail.com
@@ -105,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
105
119
  version: '0'
106
120
  requirements: []
107
121
  rubyforge_project:
108
- rubygems_version: 2.2.0
122
+ rubygems_version: 2.4.6
109
123
  signing_key:
110
124
  specification_version: 4
111
125
  summary: Scrape content from a website.