wriggler 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wriggler/version.rb +1 -1
- data/lib/wriggler.rb +67 -31
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fb5bcfb711baec8080be58ec329c95066d4cbee4
|
4
|
+
data.tar.gz: 3ff6eb28fd6f06f27398d48f4ada9d9808b73d86
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1f5fab9e467d49fd8b4f5806381501dccc6c08866af5f756c976ab1c12f42421125c90e79e0999397058b2d4162386bdee03e9a5091ca5e9043e8e8c209489e4
|
7
|
+
data.tar.gz: 48f151ba4e2e2c42853f2fb06670ead00b7f44aa9ed660882c70ea02e6618862e2804408786eaa5197942c201610749ec5123a95d1a12727f6982e576f0a5822
|
data/lib/wriggler/version.rb
CHANGED
data/lib/wriggler.rb
CHANGED
@@ -3,80 +3,116 @@ require "nokogiri"
|
|
3
3
|
require "find"
|
4
4
|
|
5
5
|
module Wriggler
|
6
|
-
|
6
|
+
attr_reader :content, :directory
|
7
7
|
|
8
8
|
def self.crawl(tags=[], directory="")
|
9
|
-
@content
|
10
|
-
@directory
|
9
|
+
@content = Hash[tags.map {|k| [k, []]}] #Hash with content
|
10
|
+
@directory = directory #Current top-level directory
|
11
11
|
|
12
12
|
navigate_directory
|
13
13
|
@content
|
14
14
|
end
|
15
15
|
|
16
|
+
private
|
17
|
+
|
16
18
|
def self.navigate_directory
|
17
|
-
|
18
|
-
|
19
|
-
|
19
|
+
#Set the cwd to the given dir send to gather all nested files from there
|
20
|
+
Dir.chdir(@directory)
|
21
|
+
gather_files
|
20
22
|
end
|
21
23
|
|
22
24
|
def self.gather_files
|
23
|
-
|
25
|
+
#Gathers all of the HTML or XML files from this and all subdirectories into an array
|
24
26
|
Find.find(@directory) do |file|
|
25
|
-
if is_XML?(file) || is_HTML?(file)
|
27
|
+
if is_XML?(file) || is_HTML?(file) || is_TXT?(file)
|
26
28
|
open_next_file(file)
|
27
29
|
end
|
28
30
|
end
|
29
31
|
end
|
30
32
|
|
31
33
|
def self.open_next_file(file)
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
34
|
+
#Opens the next file on the list, depending on the extension passes it to HTML or XML
|
35
|
+
f = File.open(file)
|
36
|
+
|
37
|
+
if is_HTML?(file)
|
38
|
+
set_HTML(f)
|
39
|
+
elsif is_XML?(file)
|
40
|
+
set_XML(f)
|
41
|
+
elsif is_TXT?(file)
|
42
|
+
set_TXT(f)
|
43
|
+
end
|
40
44
|
end
|
41
45
|
|
42
46
|
def self.is_HTML?(file)
|
43
|
-
|
44
|
-
|
47
|
+
#Determines, using a regex check, if it is an HTML file
|
48
|
+
file =~ /.html/
|
45
49
|
end
|
46
50
|
|
47
51
|
def self.is_XML?(file)
|
48
|
-
|
49
|
-
|
52
|
+
#Determines, using a regex check, if it is an XML file
|
53
|
+
file =~ /.xml/
|
54
|
+
end
|
55
|
+
|
56
|
+
def self.is_TXT?(file)
|
57
|
+
#Determines, using a regex check, if it is a TXT file
|
58
|
+
file =~ /.txt/
|
50
59
|
end
|
51
60
|
|
52
61
|
def self.set_HTML(file)
|
53
|
-
|
54
|
-
|
55
|
-
|
62
|
+
#Set the HTML file into Nokogiri for crawling
|
63
|
+
doc = Nokogiri::HTML(file)
|
64
|
+
crawl_file(doc)
|
56
65
|
end
|
57
66
|
|
58
67
|
def self.set_XML(file)
|
59
|
-
|
60
|
-
|
61
|
-
|
68
|
+
#Set the XML file into Nokogiri for crawling
|
69
|
+
doc = Nokogiri::XML(file)
|
70
|
+
crawl_file(doc)
|
71
|
+
end
|
72
|
+
|
73
|
+
def self.set_TXT(file)
|
74
|
+
#Set the TXT file into a readable String for Regex checking
|
75
|
+
doc = File.read(file)
|
76
|
+
txt_content(doc)
|
62
77
|
end
|
63
78
|
|
64
79
|
def self.crawl_file(doc)
|
65
80
|
#Crawl the Nokogiri Object for the file
|
66
81
|
@content.each_key do |key|
|
67
82
|
arr = []
|
68
|
-
if !doc.xpath("//#{key}").empty?
|
83
|
+
if !doc.xpath("//#{key}").empty?
|
69
84
|
doc.xpath("//#{key}").map{ |tag| arr << sanitize(tag.text) }
|
85
|
+
elsif key == "html"
|
86
|
+
arr << "#{doc}"
|
70
87
|
else
|
71
|
-
|
88
|
+
arr << ""
|
72
89
|
end
|
73
90
|
@content.fetch(key) << arr
|
74
91
|
end
|
75
92
|
end
|
76
93
|
|
77
|
-
def self.
|
78
|
-
|
79
|
-
|
94
|
+
def self.txt_content(doc)
|
95
|
+
#Now run through the raw text and regex out what is inbetween the tags
|
96
|
+
@content.each_key do |key|
|
97
|
+
arr = []
|
98
|
+
if key == "html"
|
99
|
+
arr << "#{doc}"
|
100
|
+
elsif contains_key(doc, key)
|
101
|
+
arr << doc.slice(/<#{key}>(.*)<\/#{key}>/).gsub(/<\/?\w+>/, "")
|
102
|
+
else
|
103
|
+
arr << ""
|
104
|
+
end
|
105
|
+
@content.fetch(key) << arr
|
106
|
+
end
|
80
107
|
end
|
81
108
|
|
109
|
+
def self.contains_key(doc, key)
|
110
|
+
#Checks if the String contains the necessary tags
|
111
|
+
doc.include?("<#{key}>") && doc.include?("</#{key}>")
|
112
|
+
end
|
113
|
+
|
114
|
+
def self.sanitize(text)
|
115
|
+
#Removes any escaped quotes, replaces them
|
116
|
+
text.gsub(/"/, "'").lstrip.chomp
|
117
|
+
end
|
82
118
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wriggler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.0
|
4
|
+
version: 1.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Elliott Young
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-06-
|
11
|
+
date: 2015-06-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|