wriggler 1.3.0 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/wriggler/version.rb +1 -1
- data/lib/wriggler.rb +67 -31
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fb5bcfb711baec8080be58ec329c95066d4cbee4
|
4
|
+
data.tar.gz: 3ff6eb28fd6f06f27398d48f4ada9d9808b73d86
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1f5fab9e467d49fd8b4f5806381501dccc6c08866af5f756c976ab1c12f42421125c90e79e0999397058b2d4162386bdee03e9a5091ca5e9043e8e8c209489e4
|
7
|
+
data.tar.gz: 48f151ba4e2e2c42853f2fb06670ead00b7f44aa9ed660882c70ea02e6618862e2804408786eaa5197942c201610749ec5123a95d1a12727f6982e576f0a5822
|
data/lib/wriggler/version.rb
CHANGED
data/lib/wriggler.rb
CHANGED
@@ -3,80 +3,116 @@ require "nokogiri"
|
|
3
3
|
require "find"
|
4
4
|
|
5
5
|
# Crawls a directory tree for HTML, XML and TXT files and collects the text
# found inside a caller-supplied list of tags.
#
# All state lives in module-level instance variables (@content, @directory),
# so a crawl is not reentrant: each call to Wriggler.crawl resets both.
module Wriggler
  # Instance-level readers, kept for any code that mixes this module in.
  attr_reader :content, :directory

  class << self
    # Module-level readers for the state populated by Wriggler.crawl.
    attr_reader :content, :directory
  end

  # Crawl +directory+ (recursively) and collect the text of every tag in +tags+.
  #
  # tags      - Array of tag names, e.g. ["title", "p"]. The special name
  #             "html" captures the whole document when no <html> node/tag
  #             pair is otherwise matched.
  # directory - Path to the top-level directory. A relative path is resolved
  #             against the current working directory; "" means the current
  #             working directory.
  #
  # Returns a Hash mapping each tag to an Array holding one Array of Strings
  # per crawled file.
  # Raises Errno::ENOENT (from Find.find) when the directory does not exist.
  def self.crawl(tags=[], directory="")
    @content = Hash[tags.map { |tag| [tag, []] }]
    # Resolve to an absolute path up front so the process working directory
    # never has to be changed and relative paths cannot be resolved twice.
    @directory = File.expand_path(directory.to_s.empty? ? "." : directory.to_s)

    navigate_directory
    @content
  end

  # NOTE: the helpers below are defined with `def self.`, so a bare `private`
  # keyword never applied to them. They stay publicly callable for backward
  # compatibility.

  # Entry point of the walk. The directory is already absolute, so no
  # Dir.chdir is performed and the caller's working directory is untouched.
  def self.navigate_directory
    gather_files
  end

  # Walks @directory recursively and hands every regular HTML, XML or TXT
  # file to open_next_file.
  def self.gather_files
    Find.find(@directory) do |file|
      # Find also yields directories; skip anything that cannot be read as a file.
      next unless File.file?(file)

      open_next_file(file) if is_XML?(file) || is_HTML?(file) || is_TXT?(file)
    end
  end

  # Opens +file+ and dispatches it to the parser matching its extension.
  # The block form of File.open guarantees the handle is closed afterwards.
  def self.open_next_file(file)
    File.open(file) do |handle|
      if is_HTML?(file)
        set_HTML(handle)
      elsif is_XML?(file)
        set_XML(handle)
      elsif is_TXT?(file)
        set_TXT(handle)
      end
    end
  end

  # True when +file+ has a ".html" extension (case-insensitive).
  def self.is_HTML?(file)
    extension?(file, ".html")
  end

  # True when +file+ has a ".xml" extension (case-insensitive).
  def self.is_XML?(file)
    extension?(file, ".xml")
  end

  # True when +file+ has a ".txt" extension (case-insensitive).
  def self.is_TXT?(file)
    extension?(file, ".txt")
  end

  # Compares the real extension of +file+ with +ext+ instead of searching the
  # whole path, so "xhtml_spec.pdf" or "feed.xml.bak" are not misclassified.
  def self.extension?(file, ext)
    File.extname(file.to_s).downcase == ext
  end

  # Parses an HTML file (IO or String) with Nokogiri and crawls it.
  def self.set_HTML(file)
    crawl_file(Nokogiri::HTML(file))
  end

  # Parses an XML file (IO or String) with Nokogiri and crawls it.
  def self.set_XML(file)
    crawl_file(Nokogiri::XML(file))
  end

  # Reads a TXT file into a String and scans it for the requested tags.
  # Accepts either an open IO object or a path.
  def self.set_TXT(file)
    doc = file.respond_to?(:read) ? file.read : File.read(file)
    txt_content(doc)
  end

  # Collects, for every requested tag, the sanitized text of all matching
  # nodes in the Nokogiri document +doc+ and appends it to @content.
  # Falls back to the whole serialized document for the "html" key, and to
  # an empty String for any other tag that is absent.
  def self.crawl_file(doc)
    @content.each_key do |key|
      nodes = doc.xpath("//#{key}") # evaluate the query once per tag
      found =
        if !nodes.empty?
          nodes.map { |tag| sanitize(tag.text) }
        elsif key == "html"
          [doc.to_s]
        else
          [""]
        end
      @content.fetch(key) << found
    end
  end

  # Scans the raw text +doc+ for every requested tag and appends what lies
  # between the opening and closing tag (markup stripped) to @content.
  # The "html" key captures the whole text; a missing tag yields "".
  def self.txt_content(doc)
    @content.each_key do |key|
      text =
        if key == "html"
          doc.to_s
        elsif contains_key(doc, key)
          extract_tag_text(doc, key)
        else
          ""
        end
      @content.fetch(key) << [text]
    end
  end

  # Returns the text between <key> and </key> in +doc+ with simple tags
  # removed, or "" when no well-ordered pair exists.
  def self.extract_tag_text(doc, key)
    name = Regexp.escape(key.to_s)
    # First try the single-line greedy match; only when that fails fall back
    # to a multi-line match so tags split across lines no longer yield nil.
    matched = doc[/<#{name}>(.*)<\/#{name}>/] || doc[/<#{name}>.*?<\/#{name}>/m]
    matched ? matched.gsub(/<\/?\w+>/, "") : ""
  end

  # True when +doc+ contains both the opening and the closing tag for +key+.
  def self.contains_key(doc, key)
    doc.include?("<#{key}>") && doc.include?("</#{key}>")
  end

  # Replaces double quotes with single quotes, drops leading whitespace and
  # a single trailing line break.
  def self.sanitize(text)
    text.tr('"', "'").lstrip.chomp
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wriggler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.0
|
4
|
+
version: 1.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Elliott Young
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-06-
|
11
|
+
date: 2015-06-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|