wriggler 1.3.0 → 1.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 92e026f6d129cad6ba890e1c23e98ad8a48a2918
4
- data.tar.gz: 5f759a905eec16613eec0cdcf8e1f0d8e19b4e1c
3
+ metadata.gz: fb5bcfb711baec8080be58ec329c95066d4cbee4
4
+ data.tar.gz: 3ff6eb28fd6f06f27398d48f4ada9d9808b73d86
5
5
  SHA512:
6
- metadata.gz: ca9c5e3a15dc0b0422522b241d01b5b7ac10de666d114a8b87a5114fc4439328323f9a04628022e02d2376a40f83a98a3da20b9921425f1f49a39ce4bda431e0
7
- data.tar.gz: 03d25bae580a894d6251b5cc3c19057971414715ca6f6a3e61561723e8242a1048d0ed81f3665f6892b5911ea5b49cab7374dbc7ba42deaa445519c0cd4007f5
6
+ metadata.gz: 1f5fab9e467d49fd8b4f5806381501dccc6c08866af5f756c976ab1c12f42421125c90e79e0999397058b2d4162386bdee03e9a5091ca5e9043e8e8c209489e4
7
+ data.tar.gz: 48f151ba4e2e2c42853f2fb06670ead00b7f44aa9ed660882c70ea02e6618862e2804408786eaa5197942c201610749ec5123a95d1a12727f6982e576f0a5822
data/lib/wriggler/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Wriggler
2
- VERSION = "1.3.0"
2
+ VERSION = "1.4.0"
3
3
  end
data/lib/wriggler.rb CHANGED
@@ -3,80 +3,116 @@ require "nokogiri"
3
3
  require "find"
4
4
 
5
5
  module Wriggler
6
- attr_reader :content, :directory
6
+ attr_reader :content, :directory
7
7
 
8
8
  def self.crawl(tags=[], directory="")
9
- @content = Hash[tags.map {|k| [k, []]}] #Hash with content
10
- @directory = directory #Current top-level directory
9
+ @content = Hash[tags.map {|k| [k, []]}] #Hash with content
10
+ @directory = directory #Current top-level directory
11
11
 
12
12
  navigate_directory
13
13
  @content
14
14
  end
15
15
 
16
+ private
17
+
16
18
  def self.navigate_directory
17
- #Set the cwd to the given dir send to gather all nested files from there
18
- Dir.chdir(@directory)
19
- gather_files
19
+ #Set the cwd to the given dir, then call gather_files to collect all nested files from there
20
+ Dir.chdir(@directory)
21
+ gather_files
20
22
  end
21
23
 
22
24
  def self.gather_files
23
- #Gathers all of the HTML or XML files from this and all subdirectories into an array
25
+ #Walks this directory and all subdirectories, opening each HTML, XML or TXT file found
24
26
  Find.find(@directory) do |file|
25
- if is_XML?(file) || is_HTML?(file)
27
+ if is_XML?(file) || is_HTML?(file) || is_TXT?(file)
26
28
  open_next_file(file)
27
29
  end
28
30
  end
29
31
  end
30
32
 
31
33
  def self.open_next_file(file)
32
- #Opens the next file on the list, depending on the extension passes it to HTML or XML
33
- f = File.open(file)
34
-
35
- if is_HTML?(file)
36
- set_HTML(f)
37
- elsif is_XML?(file)
38
- set_XML(f)
39
- end
34
+ #Opens the given file and, depending on its extension, passes it to the HTML, XML or TXT handler
35
+ f = File.open(file)
36
+
37
+ if is_HTML?(file)
38
+ set_HTML(f)
39
+ elsif is_XML?(file)
40
+ set_XML(f)
41
+ elsif is_TXT?(file)
42
+ set_TXT(f)
43
+ end
40
44
  end
41
45
 
42
46
  def self.is_HTML?(file)
43
- #Determines, using a regex check, if it is an HTML file
44
- file =~ /.html/
47
+ #Determines, using a regex check, if it is an HTML file
48
+ file =~ /.html/
45
49
  end
46
50
 
47
51
  def self.is_XML?(file)
48
- #Determines, using a regex check, if it is an XML file
49
- file =~ /.xml/
52
+ #Determines, using a regex check, if it is an XML file
53
+ file =~ /.xml/
54
+ end
55
+
56
+ def self.is_TXT?(file)
57
+ #Determines, using a regex check, if it is a TXT file
58
+ file =~ /.txt/
50
59
  end
51
60
 
52
61
  def self.set_HTML(file)
53
- #Set the HTML file into Nokogiri for crawling
54
- doc = Nokogiri::HTML(file)
55
- crawl_file(doc)
62
+ #Set the HTML file into Nokogiri for crawling
63
+ doc = Nokogiri::HTML(file)
64
+ crawl_file(doc)
56
65
  end
57
66
 
58
67
  def self.set_XML(file)
59
- #Set the XML file into Nokogiri for crawling
60
- doc = Nokogiri::XML(file)
61
- crawl_file(doc)
68
+ #Set the XML file into Nokogiri for crawling
69
+ doc = Nokogiri::XML(file)
70
+ crawl_file(doc)
71
+ end
72
+
73
+ def self.set_TXT(file)
74
+ #Set the TXT file into a readable String for Regex checking
75
+ doc = File.read(file)
76
+ txt_content(doc)
62
77
  end
63
78
 
64
79
  def self.crawl_file(doc)
65
80
  #Crawl the Nokogiri Object for the file
66
81
  @content.each_key do |key|
67
82
  arr = []
68
- if !doc.xpath("//#{key}").empty? #Returns an empty array if tag is not present
83
+ if !doc.xpath("//#{key}").empty?
69
84
  doc.xpath("//#{key}").map{ |tag| arr << sanitize(tag.text) }
85
+ elsif key == "html"
86
+ arr << "#{doc}"
70
87
  else
71
- doc.xpath("//#{key}").map{ |_| arr << "" }
88
+ arr << ""
72
89
  end
73
90
  @content.fetch(key) << arr
74
91
  end
75
92
  end
76
93
 
77
- def self.sanitize(text)
78
- #Removes any escaped quotes, replaces them
79
- text.gsub(/"/, "'").lstrip.chomp
94
+ def self.txt_content(doc)
95
+ #Now run through the raw text and regex out what is in between the tags
96
+ @content.each_key do |key|
97
+ arr = []
98
+ if key == "html"
99
+ arr << "#{doc}"
100
+ elsif contains_key(doc, key)
101
+ arr << doc.slice(/<#{key}>(.*)<\/#{key}>/).gsub(/<\/?\w+>/, "")
102
+ else
103
+ arr << ""
104
+ end
105
+ @content.fetch(key) << arr
106
+ end
80
107
  end
81
108
 
109
+ def self.contains_key(doc, key)
110
+ #Checks if the String contains the necessary tags
111
+ doc.include?("<#{key}>") && doc.include?("</#{key}>")
112
+ end
113
+
114
+ def self.sanitize(text)
115
+ #Replaces double quotes with single quotes, then strips leading whitespace and a trailing newline
116
+ text.gsub(/"/, "'").lstrip.chomp
117
+ end
82
118
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wriggler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 1.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Elliott Young
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2015-06-10 00:00:00.000000000 Z
11
+ date: 2015-06-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler