wriggler 1.4.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wriggler.rb +5 -42
- data/lib/wriggler/version.rb +1 -1
- metadata +3 -9
- data/dirtest/nested_fldr/test5.xml +0 -1
- data/dirtest/tag_content.csv +0 -5
- data/dirtest/test1.xml +0 -31
- data/dirtest/test2.xml +0 -30
- data/dirtest/test3.xml +0 -30
- data/dirtest/test4.html +0 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6d5737be7d530672feb74afaf6c76a114e162fd0
|
4
|
+
data.tar.gz: 52cf928fe43f502489440a5c2fe71a3d204fc858
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f5da62668ff4fb22d91f34632f06ddb89aa21d3cbee37bf9519006066cf06f688992dfe313c7dca59ad209321e6f1e8552485fbfe3568305d9a004419d1b4885
|
7
|
+
data.tar.gz: d437debf056c35d0ebe570776a4db226c8ef00d31d503c467046c5feca5d6a68f1bec680bf258ea949e17cbbcd99a1bf2fbbeebb7f6c03ed2d013b2dc39c933a
|
data/lib/wriggler.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
require "wriggler/version"
|
2
2
|
require "nokogiri"
|
3
3
|
require "find"
|
4
|
+
require "utf8_utils"
|
4
5
|
|
5
6
|
module Wriggler
|
6
7
|
attr_reader :content, :directory
|
@@ -17,7 +18,7 @@ module Wriggler
|
|
17
18
|
|
18
19
|
def self.navigate_directory
|
19
20
|
#Set the cwd to the given dir send to gather all nested files from there
|
20
|
-
Dir.chdir(@directory)
|
21
|
+
Dir.chdir(@directory)
|
21
22
|
gather_files
|
22
23
|
end
|
23
24
|
|
@@ -38,8 +39,6 @@ module Wriggler
|
|
38
39
|
set_HTML(f)
|
39
40
|
elsif is_XML?(file)
|
40
41
|
set_XML(f)
|
41
|
-
elsif is_TXT?(file)
|
42
|
-
set_TXT(f)
|
43
42
|
end
|
44
43
|
end
|
45
44
|
|
@@ -53,11 +52,6 @@ module Wriggler
|
|
53
52
|
file =~ /.xml/
|
54
53
|
end
|
55
54
|
|
56
|
-
def self.is_TXT?(file)
|
57
|
-
#Determines, using a regex check, if it is a TXT file
|
58
|
-
file =~ /.txt/
|
59
|
-
end
|
60
|
-
|
61
55
|
def self.set_HTML(file)
|
62
56
|
#Set the HTML file into Nokogiri for crawling
|
63
57
|
doc = Nokogiri::HTML(file)
|
@@ -70,49 +64,18 @@ module Wriggler
|
|
70
64
|
crawl_file(doc)
|
71
65
|
end
|
72
66
|
|
73
|
-
def self.set_TXT(file)
|
74
|
-
#Set the TXT file into a readable String for Regex checking
|
75
|
-
doc = File.read(file)
|
76
|
-
txt_content(doc)
|
77
|
-
end
|
78
|
-
|
79
67
|
def self.crawl_file(doc)
|
80
68
|
#Crawl the Nokogiri Object for the file
|
81
69
|
@content.each_key do |key|
|
82
70
|
arr = []
|
83
|
-
if !doc.
|
84
|
-
doc.
|
85
|
-
elsif key == "html"
|
86
|
-
arr << "#{doc}"
|
87
|
-
else
|
88
|
-
arr << ""
|
89
|
-
end
|
90
|
-
@content.fetch(key) << arr
|
91
|
-
end
|
92
|
-
end
|
93
|
-
|
94
|
-
def self.txt_content(doc)
|
95
|
-
#Now run through the raw text and regex out what is inbetween the tags
|
96
|
-
@content.each_key do |key|
|
97
|
-
arr = []
|
98
|
-
if key == "html"
|
71
|
+
if !doc.css("#{key}").empty?
|
72
|
+
doc.css("#{key}").map{ |tag| arr << sanitize(tag.text) }
|
73
|
+
elsif key == "html" || key == "xml"
|
99
74
|
arr << "#{doc}"
|
100
|
-
elsif contains_key(doc, key)
|
101
|
-
arr << doc.slice(/<#{key}>(.*)<\/#{key}>/).gsub(/<\/?\w+>/, "")
|
102
75
|
else
|
103
76
|
arr << ""
|
104
77
|
end
|
105
78
|
@content.fetch(key) << arr
|
106
79
|
end
|
107
80
|
end
|
108
|
-
|
109
|
-
def self.contains_key(doc, key)
|
110
|
-
#Checks if the String contains the necessary tags
|
111
|
-
doc.include?("<#{key}>") && doc.include?("</#{key}>")
|
112
|
-
end
|
113
|
-
|
114
|
-
def self.sanitize(text)
|
115
|
-
#Removes any escaped quotes, replaces them
|
116
|
-
text.gsub(/"/, "'").lstrip.chomp
|
117
|
-
end
|
118
81
|
end
|
data/lib/wriggler/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wriggler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Elliott Young
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-01-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -99,12 +99,6 @@ files:
|
|
99
99
|
- Rakefile
|
100
100
|
- bin/console
|
101
101
|
- bin/setup
|
102
|
-
- dirtest/nested_fldr/test5.xml
|
103
|
-
- dirtest/tag_content.csv
|
104
|
-
- dirtest/test1.xml
|
105
|
-
- dirtest/test2.xml
|
106
|
-
- dirtest/test3.xml
|
107
|
-
- dirtest/test4.html
|
108
102
|
- lib/wriggler.rb
|
109
103
|
- lib/wriggler/version.rb
|
110
104
|
- wriggler.gemspec
|
@@ -129,7 +123,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
129
123
|
version: '0'
|
130
124
|
requirements: []
|
131
125
|
rubyforge_project:
|
132
|
-
rubygems_version: 2.
|
126
|
+
rubygems_version: 2.2.2
|
133
127
|
signing_key:
|
134
128
|
specification_version: 4
|
135
129
|
summary: A Gem designed to crawl through a local directory of HTML/XML files and pull
|
data/dirtest/nested_fldr/test5.xml
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
<test>If this appears it works</test>
|
data/dirtest/tag_content.csv
DELETED
@@ -1,5 +0,0 @@
|
|
1
|
-
character,test,name,sitcom
|
2
|
-
"[""Al Bundy"", ""Bud Bundy"", ""Marcy Darcy"", ""Larry Appleton"", ""Balki Bartokomous"", ""John 'Hannibal' Smith"", ""Templeton 'Face' Peck"", ""'B.A.' Baracus"", ""'Howling Mad' Murdock""]","[""Al Bundy"", ""Bud Bundy"", ""Marcy Darcy"", ""Larry Appleton"", ""Balki Bartokomous"", ""John 'Hannibal' Smith"", ""Templeton 'Face' Peck"", ""'B.A.' Baracus"", ""'Howling Mad' Murdock""]","[""Al Bundy"", ""Bud Bundy"", ""Marcy Darcy"", ""Larry Appleton"", ""Balki Bartokomous"", ""John 'Hannibal' Smith"", ""Templeton 'Face' Peck"", ""'B.A.' Baracus"", ""'Howling Mad' Murdock""]"
|
3
|
-
"[""If this appears it works""]","[""This is different""]"
|
4
|
-
"[""Married with Children"", ""Perfect Strangers"", ""The A-Team""]","[""Married with Children"", ""Perfect Strangers"", ""The A-Team""]","[""Married with Children"", ""Perfect Strangers"", ""The A-Team""]"
|
5
|
-
"[""This is different\n Married with Children\n \n Al Bundy\n Bud Bundy\n Marcy Darcy\n \n "", ""Perfect Strangers\n \n Larry Appleton\n Balki Bartokomous\n \n ""]","[""Married with Children\n \n Al Bundy\n Bud Bundy\n Marcy Darcy\n \n "", ""Perfect Strangers\n \n Larry Appleton\n Balki Bartokomous\n \n ""]","[""Married with Children\n \n Al Bundy\n Bud Bundy\n Marcy Darcy\n \n "", ""Perfect Strangers\n \n Larry Appleton\n Balki Bartokomous\n \n ""]"
|
data/dirtest/test1.xml
DELETED
@@ -1,31 +0,0 @@
|
|
1
|
-
<root>
|
2
|
-
<sitcoms>
|
3
|
-
<sitcom>
|
4
|
-
<test>This is different</test>
|
5
|
-
<name>Married with Children</name>
|
6
|
-
<characters>
|
7
|
-
<character>Al Bundy</character>
|
8
|
-
<character>Bud Bundy</character>
|
9
|
-
<character>Marcy Darcy</character>
|
10
|
-
</characters>
|
11
|
-
</sitcom>
|
12
|
-
<sitcom>
|
13
|
-
<name>Perfect Strangers</name>
|
14
|
-
<characters>
|
15
|
-
<character>Larry Appleton</character>
|
16
|
-
<character>Balki Bartokomous</character>
|
17
|
-
</characters>
|
18
|
-
</sitcom>
|
19
|
-
</sitcoms>
|
20
|
-
<dramas>
|
21
|
-
<drama>
|
22
|
-
<name>The A-Team</name>
|
23
|
-
<characters>
|
24
|
-
<character>John "Hannibal" Smith</character>
|
25
|
-
<character>Templeton "Face" Peck</character>
|
26
|
-
<character>"B.A." Baracus</character>
|
27
|
-
<character>"Howling Mad" Murdock</character>
|
28
|
-
</characters>
|
29
|
-
</drama>
|
30
|
-
</dramas>
|
31
|
-
</root>
|
data/dirtest/test2.xml
DELETED
@@ -1,30 +0,0 @@
|
|
1
|
-
<root>
|
2
|
-
<sitcoms>
|
3
|
-
<sitcom>
|
4
|
-
<name>Married with Children</name>
|
5
|
-
<characters>
|
6
|
-
<character>Al Bundy</character>
|
7
|
-
<character>Bud Bundy</character>
|
8
|
-
<character>Marcy Darcy</character>
|
9
|
-
</characters>
|
10
|
-
</sitcom>
|
11
|
-
<sitcom>
|
12
|
-
<name>Perfect Strangers</name>
|
13
|
-
<characters>
|
14
|
-
<character>Larry Appleton</character>
|
15
|
-
<character>Balki Bartokomous</character>
|
16
|
-
</characters>
|
17
|
-
</sitcom>
|
18
|
-
</sitcoms>
|
19
|
-
<dramas>
|
20
|
-
<drama>
|
21
|
-
<name>The A-Team</name>
|
22
|
-
<characters>
|
23
|
-
<character>John "Hannibal" Smith</character>
|
24
|
-
<character>Templeton "Face" Peck</character>
|
25
|
-
<character>"B.A." Baracus</character>
|
26
|
-
<character>"Howling Mad" Murdock</character>
|
27
|
-
</characters>
|
28
|
-
</drama>
|
29
|
-
</dramas>
|
30
|
-
</root>
|
data/dirtest/test3.xml
DELETED
@@ -1,30 +0,0 @@
|
|
1
|
-
<root>
|
2
|
-
<sitcoms>
|
3
|
-
<sitcom>
|
4
|
-
<name>Married with Children</name>
|
5
|
-
<characters>
|
6
|
-
<character>Al Bundy</character>
|
7
|
-
<character>Bud Bundy</character>
|
8
|
-
<character>Marcy Darcy</character>
|
9
|
-
</characters>
|
10
|
-
</sitcom>
|
11
|
-
<sitcom>
|
12
|
-
<name>Perfect Strangers</name>
|
13
|
-
<characters>
|
14
|
-
<character>Larry Appleton</character>
|
15
|
-
<character>Balki Bartokomous</character>
|
16
|
-
</characters>
|
17
|
-
</sitcom>
|
18
|
-
</sitcoms>
|
19
|
-
<dramas>
|
20
|
-
<drama>
|
21
|
-
<name>The A-Team</name>
|
22
|
-
<characters>
|
23
|
-
<character>John "Hannibal" Smith</character>
|
24
|
-
<character>Templeton "Face" Peck</character>
|
25
|
-
<character>"B.A." Baracus</character>
|
26
|
-
<character>"Howling Mad" Murdock</character>
|
27
|
-
</characters>
|
28
|
-
</drama>
|
29
|
-
</dramas>
|
30
|
-
</root>
|