wriggler 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 92e026f6d129cad6ba890e1c23e98ad8a48a2918
4
- data.tar.gz: 5f759a905eec16613eec0cdcf8e1f0d8e19b4e1c
3
+ metadata.gz: fb5bcfb711baec8080be58ec329c95066d4cbee4
4
+ data.tar.gz: 3ff6eb28fd6f06f27398d48f4ada9d9808b73d86
5
5
  SHA512:
6
- metadata.gz: ca9c5e3a15dc0b0422522b241d01b5b7ac10de666d114a8b87a5114fc4439328323f9a04628022e02d2376a40f83a98a3da20b9921425f1f49a39ce4bda431e0
7
- data.tar.gz: 03d25bae580a894d6251b5cc3c19057971414715ca6f6a3e61561723e8242a1048d0ed81f3665f6892b5911ea5b49cab7374dbc7ba42deaa445519c0cd4007f5
6
+ metadata.gz: 1f5fab9e467d49fd8b4f5806381501dccc6c08866af5f756c976ab1c12f42421125c90e79e0999397058b2d4162386bdee03e9a5091ca5e9043e8e8c209489e4
7
+ data.tar.gz: 48f151ba4e2e2c42853f2fb06670ead00b7f44aa9ed660882c70ea02e6618862e2804408786eaa5197942c201610749ec5123a95d1a12727f6982e576f0a5822
@@ -1,3 +1,3 @@
1
1
  module Wriggler
2
- VERSION = "1.3.0"
2
+ VERSION = "1.4.0"
3
3
  end
data/lib/wriggler.rb CHANGED
@@ -3,80 +3,116 @@ require "nokogiri"
3
3
  require "find"
4
4
 
5
5
  module Wriggler
6
- attr_reader :content, :directory
6
+ attr_reader :content, :directory
7
7
 
8
8
  def self.crawl(tags=[], directory="")
9
- @content = Hash[tags.map {|k| [k, []]}] #Hash with content
10
- @directory = directory #Current top-level directory
9
+ @content = Hash[tags.map {|k| [k, []]}] #Hash with content
10
+ @directory = directory #Current top-level directory
11
11
 
12
12
  navigate_directory
13
13
  @content
14
14
  end
15
15
 
16
+ private
17
+
16
18
  def self.navigate_directory
17
- #Set the cwd to the given dir send to gather all nested files from there
18
- Dir.chdir(@directory)
19
- gather_files
19
+ #Set the cwd to the given dir, then gather all nested files from there
20
+ Dir.chdir(@directory)
21
+ gather_files
20
22
  end
21
23
 
22
24
  def self.gather_files
23
- #Gathers all of the HTML or XML files from this and all subdirectories into an array
25
+ #Finds every HTML, XML or TXT file in this directory and all subdirectories and opens each one
24
26
  Find.find(@directory) do |file|
25
- if is_XML?(file) || is_HTML?(file)
27
+ if is_XML?(file) || is_HTML?(file) || is_TXT?(file)
26
28
  open_next_file(file)
27
29
  end
28
30
  end
29
31
  end
30
32
 
31
33
  def self.open_next_file(file)
32
- #Opens the next file on the list, depending on the extension passes it to HTML or XML
33
- f = File.open(file)
34
-
35
- if is_HTML?(file)
36
- set_HTML(f)
37
- elsif is_XML?(file)
38
- set_XML(f)
39
- end
34
+ #Opens the next file on the list and, depending on the extension, passes it to the HTML, XML or TXT handler
35
+ f = File.open(file)
36
+
37
+ if is_HTML?(file)
38
+ set_HTML(f)
39
+ elsif is_XML?(file)
40
+ set_XML(f)
41
+ elsif is_TXT?(file)
42
+ set_TXT(f)
43
+ end
40
44
  end
41
45
 
42
46
  def self.is_HTML?(file)
43
- #Determines, using a regex check, if it is an HTML file
44
- file =~ /.html/
47
+ #Determines, using a regex check, if it is an HTML file
48
+ file =~ /.html/
45
49
  end
46
50
 
47
51
  def self.is_XML?(file)
48
- #Determines, using a regex check, if it is an XML file
49
- file =~ /.xml/
52
+ #Determines, using a regex check, if it is an XML file
53
+ file =~ /.xml/
54
+ end
55
+
56
+ def self.is_TXT?(file)
57
+ #Determines, using a regex check, if it is a TXT file
58
+ file =~ /.txt/
50
59
  end
51
60
 
52
61
  def self.set_HTML(file)
53
- #Set the HTML file into Nokogiri for crawling
54
- doc = Nokogiri::HTML(file)
55
- crawl_file(doc)
62
+ #Set the HTML file into Nokogiri for crawling
63
+ doc = Nokogiri::HTML(file)
64
+ crawl_file(doc)
56
65
  end
57
66
 
58
67
  def self.set_XML(file)
59
- #Set the XML file into Nokogiri for crawling
60
- doc = Nokogiri::XML(file)
61
- crawl_file(doc)
68
+ #Set the XML file into Nokogiri for crawling
69
+ doc = Nokogiri::XML(file)
70
+ crawl_file(doc)
71
+ end
72
+
73
+ def self.set_TXT(file)
74
+ #Set the TXT file into a readable String for Regex checking
75
+ doc = File.read(file)
76
+ txt_content(doc)
62
77
  end
63
78
 
64
79
  def self.crawl_file(doc)
65
80
  #Crawl the Nokogiri Object for the file
66
81
  @content.each_key do |key|
67
82
  arr = []
68
- if !doc.xpath("//#{key}").empty? #Returns an empty array if tag is not present
83
+ if !doc.xpath("//#{key}").empty?
69
84
  doc.xpath("//#{key}").map{ |tag| arr << sanitize(tag.text) }
85
+ elsif key == "html"
86
+ arr << "#{doc}"
70
87
  else
71
- doc.xpath("//#{key}").map{ |_| arr << "" }
88
+ arr << ""
72
89
  end
73
90
  @content.fetch(key) << arr
74
91
  end
75
92
  end
76
93
 
77
- def self.sanitize(text)
78
- #Removes any escaped quotes, replaces them
79
- text.gsub(/"/, "'").lstrip.chomp
94
+ def self.txt_content(doc)
95
+ #Now run through the raw text and regex out what is in between the tags
96
+ @content.each_key do |key|
97
+ arr = []
98
+ if key == "html"
99
+ arr << "#{doc}"
100
+ elsif contains_key(doc, key)
101
+ arr << doc.slice(/<#{key}>(.*)<\/#{key}>/).gsub(/<\/?\w+>/, "")
102
+ else
103
+ arr << ""
104
+ end
105
+ @content.fetch(key) << arr
106
+ end
80
107
  end
81
108
 
109
+ def self.contains_key(doc, key)
110
+ #Checks if the String contains the necessary tags
111
+ doc.include?("<#{key}>") && doc.include?("</#{key}>")
112
+ end
113
+
114
+ def self.sanitize(text)
115
+ #Replaces double quotes with single quotes, then strips leading whitespace and a trailing newline
116
+ text.gsub(/"/, "'").lstrip.chomp
117
+ end
82
118
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wriggler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 1.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Elliott Young
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2015-06-10 00:00:00.000000000 Z
11
+ date: 2015-06-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler