sitetap 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 89d0090637f8b89550f043a3fbfc6d7094c4bfa9
4
- data.tar.gz: f58e37144bb456e114d2c44957a11969ed47366e
3
+ metadata.gz: 54e85a540d37ca649e921379f6c45b4142713339
4
+ data.tar.gz: 4f1e3f07367b37a1fb21ff14ecb0780f4f4e962f
5
5
  SHA512:
6
- metadata.gz: f0b5a22231d7239bdff707c26345654d5a60ce97c9fc63922abfb954f531532a8b58edb3a7451f6b661167a404a40efde8f53a59f40f61640a341a1b701a5384
7
- data.tar.gz: 65843f3c0c823ee2ffbeff2f95d70b7f781899565961d7c0b00c5a0b03504cfd3162ad11674ceb188c64415c2049eb948341d757448b113ec04de91621f04700
6
+ metadata.gz: f2cc1b00ec4b37fc26facbf3c27c59595c6fdc5de8502424186fc320d985f550c28f986da30fbddce40c6891795ed51cfc56edade094764402de93f2fd743ee8
7
+ data.tar.gz: 0294b1b4d7dc0e458b3a06fac61ef5623d74b395c55a7e2c99209c765113987f193d662712561d47ea4729b76d887e1bc2d10c239299456a1783c477d02c146b
data/bin/sitetap CHANGED
@@ -3,12 +3,13 @@
3
3
  require 'sitetap/scraper'
4
4
  require 'sitetap/parser'
5
5
 
6
- url = ARGV[0]
6
+ url = ARGV[0]
7
+ selector = ARGV[1]
7
8
 
8
9
  if url.nil? || url == ''
9
10
  puts "Usage: sitetap [URL]"
10
11
  exit
11
12
  else
12
13
  scraper = Sitetap::Scraper.scrape!(url)
13
- parser = Sitetap::Parser.parse!(scraper.dir)
14
+ parser = Sitetap::Parser.parse!(scraper.dir, selector)
14
15
  end
@@ -1,6 +1,7 @@
1
1
  require 'nokogiri'
2
2
  require 'reverse_markdown'
3
3
  require 'fileutils'
4
+ require 'sanitize'
4
5
 
5
6
  module Sitetap
6
7
  class Parser
@@ -9,12 +10,13 @@ module Sitetap
9
10
  @root = root_dir
10
11
  end
11
12
 
12
- def self.parse!(root_dir)
13
- parser = Sitetap::Parser.new(root_dir).parse!
13
+ def self.parse!(root_dir, selector = nil)
14
+ parser = Sitetap::Parser.new(root_dir).parse!(selector)
14
15
  parser
15
16
  end
16
17
 
17
- def parse!
18
+ def parse!(selector = nil)
19
+ @selector = selector unless selector.nil?
18
20
  verify_directories
19
21
  do_the_loop
20
22
  self
@@ -78,24 +80,24 @@ module Sitetap
78
80
 
79
81
  # get the path of the file relative to the html
80
82
  # directory (scraped dir)
81
- #
83
+ #
82
84
  file_path = file.gsub(/#{html_dir}\//, '')
83
85
 
84
86
  # clean the contents of the html file so we can work
85
87
  # with it
86
- #
88
+ #
87
89
  contents = clean_html(file)
88
90
 
89
91
  # set the references to where the new files will
90
92
  # live
91
- #
93
+ #
92
94
  tmp_file_path = "#{tmp_dir}/#{file_path}"
93
95
  markdown_file_path = "#{md_dir}/#{file_path}.md"
94
96
  text_file_path = "#{txt_dir}/#{file_path}.txt"
95
97
 
96
98
  # find or create directories that will contain the
97
99
  # file
98
- #
100
+ #
99
101
  verify_file_directories([
100
102
  tmp_file_path,
101
103
  markdown_file_path,
@@ -104,22 +106,22 @@ module Sitetap
104
106
 
105
107
  # write a temporary html file with the cleaned-up
106
108
  # contents
107
- #
109
+ #
108
110
  write_file(tmp_file_path, contents)
109
111
 
110
112
  # now we hone in on the html contents and strip the
111
113
  # stuff we don't need
112
- #
114
+ #
113
115
  adj_contents = filter_html(tmp_file_path)
114
116
 
115
117
  # convert the adjusted html to markdown and write it
116
118
  # to file
117
- #
119
+ #
118
120
  write_file(markdown_file_path, html2markdown(adj_contents))
119
121
 
120
122
  # last, we remove all the tags and write the plain
121
123
  # text file
122
- #
124
+ #
123
125
  write_file(text_file_path, strip_tags(adj_contents))
124
126
 
125
127
  end
@@ -145,23 +147,22 @@ module Sitetap
145
147
  end
146
148
 
147
149
  def filter_html(file_path)
148
- contents = File.read(file_path, :encoding => 'ASCII')
150
+ contents = File.read(file_path, :encoding => 'UTF-8')
149
151
  page = Nokogiri::HTML(contents)
150
152
  content = page.css(selector).to_s
151
- # content = page.css('body').to_s if content == ''
152
153
  end
153
154
 
154
155
  def strip_tags(html)
155
- html = html.gsub(/(<[^>]*>)|\n|\t/s, ' ')
156
- html.gsub(/(\ \ )+/, "\n\n")
156
+ html = Sanitize.fragment(html)
157
+ html.gsub(/\n(\ )+/, "\n").gsub(/\ \ +/, "\n\n").gsub(/\n\n\n+/, "\n\n")
157
158
  end
158
159
 
159
160
  def html2markdown(html)
160
161
  ReverseMarkdown.convert(
161
- html,
162
- :unknown_tags => :bypass,
162
+ html,
163
+ :unknown_tags => :bypass,
163
164
  :github_flavored => true
164
- )
165
+ ).gsub(/\n(\ )+/, "\n").gsub(/\n\n\n+/, "\n\n")
165
166
  end
166
167
 
167
168
  # ------------------------------------ Writing Files
@@ -1,3 +1,3 @@
1
1
  module Sitetap
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.2"
3
3
  end
data/sitetap.gemspec CHANGED
@@ -22,4 +22,5 @@ Gem::Specification.new do |spec|
22
22
  spec.add_development_dependency "rake", "~> 10.0"
23
23
  spec.add_runtime_dependency "nokogiri"
24
24
  spec.add_runtime_dependency "reverse_markdown"
25
+ spec.add_runtime_dependency "sanitize"
25
26
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sitetap
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sean C Davis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-17 00:00:00.000000000 Z
11
+ date: 2015-11-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: sanitize
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  description: ''
70
84
  email:
71
85
  - scdavis41@gmail.com
@@ -105,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
105
119
  version: '0'
106
120
  requirements: []
107
121
  rubyforge_project:
108
- rubygems_version: 2.2.0
122
+ rubygems_version: 2.4.6
109
123
  signing_key:
110
124
  specification_version: 4
111
125
  summary: Scrape content from a website.