html_text_gem 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. data/bin/html_text_gem +7 -0
  2. data/lib/html_text_gem.rb +36 -0
  3. metadata +47 -0
data/bin/html_text_gem ADDED
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "rubygems"
4
+ require "nokogiri"
5
+ #require 'open-uri'
6
+ require "html_text_gem"
7
+
@@ -0,0 +1,36 @@
1
+ #!/usr/bin/env ruby
2
+ require "rubygems"
3
+ require 'nokogiri'
4
+ #require 'open-uri'
5
+
6
# Renders an HTML node as plain text.
#
# All edits are made on a copy obtained via node.dup, and the text of that
# copy is what gets returned. <br> elements become a newline, <hr> elements
# become a 70-dash rule on a line of its own, and every <div> is followed
# by a blank line.
#
# node - an object exposing the Nokogiri node API used below
#        (dup, xpath, css, text).
#
# Returns the text content of the modified copy.
def html_to_text(node)
  blocks = %w[div] # block-level tags that get a blank line appended
  # Tags that are replaced outright by literal text.
  separator = { "br" => "\n", "hr" => "\n#{'-' * 70}\n" }
  copy = node.dup

  # remove whitespaces
  # NOTE(review): this pattern only fires on a literal ">", whitespace, "<"
  # sequence inside a text node; collapsing every whitespace run would need
  # /\s+/ instead -- confirm the intent.
  copy.xpath('.//text()').each { |t| t.content = t.text.gsub(/>\s+</, " ") }

  # swap out the separator
  copy.css(separator.keys.join(',')).each { |n| n.replace(separator[n.name]) }

  # add newlines after each block level element
  copy.css(blocks.join(',')).each { |n| n.after("\n\n") }

  # return modified text content
  copy.text
end
27
+
28
# Top-level script: executed as soon as this file is loaded.
# Parses index.html from the current working directory; the block form of
# File.open closes the handle once parsing is done.
doc = File.open('index.html') { |io| Nokogiri::HTML(io, nil, 'UTF-8') }
parse = html_to_text(doc)
#puts html_to_text(doc)

# write to text file
File.write("snippet.txt", parse)
34
+
35
+
36
+
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html_text_gem
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Vy Nguyen
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-12-10 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: A gem to create text files
15
+ email: vnguye36@gmail.com
16
+ executables:
17
+ - html_text_gem
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - lib/html_text_gem.rb
22
+ - bin/html_text_gem
23
+ homepage: https://rubygems.org/profiles/maxdoodle
24
+ licenses: []
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ! '>='
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ required_rubygems_version: !ruby/object:Gem::Requirement
36
+ none: false
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ requirements: []
42
+ rubyforge_project:
43
+ rubygems_version: 1.8.25
44
+ signing_key:
45
+ specification_version: 3
46
+ summary: Making a test gem
47
+ test_files: []