wriggler 0.1.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/dirtest/nested_fldr/test5.xml +1 -0
- data/dirtest/test1.xml +1 -0
- data/dirtest/test4.html +7 -0
- data/lib/wriggler/version.rb +1 -1
- data/lib/wriggler.rb +40 -21
- data/test.rb +60 -58
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1d741f12caa9d0cae037e2689e76ff0aa6adee13
|
4
|
+
data.tar.gz: d84a237b2f9c49ef76df0a1fa7db330d32d031ea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7fd896183312ea3a2f9ab5e00836dfaf12f7af1486796deedd2f80f1c3f851ae437d29d8c48566a8f89c8ff04148b51c797ad82a8518fd65fb025178f75c80c0
|
7
|
+
data.tar.gz: 0aa7b338518a6285e5bb3aac82ff401b0157a3b72f805b49f2b066851e1aa81f22a7f03924db2625ea50eece17c6492942424664e380fa2c50ab89d5dd0f0b8d
|
@@ -0,0 +1 @@
|
|
1
|
+
<test>If this appears it works</test>
|
data/dirtest/test1.xml
CHANGED
data/dirtest/test4.html
ADDED
data/lib/wriggler/version.rb
CHANGED
data/lib/wriggler.rb
CHANGED
@@ -1,32 +1,43 @@
|
|
1
1
|
require "wriggler/version"
|
2
2
|
require "nokogiri"
|
3
|
+
require "find"
|
3
4
|
|
4
5
|
module Wriggler
|
5
6
|
attr_reader :content, :directory
|
6
7
|
|
7
|
-
def crawl(tags=[], directory=""
|
8
|
-
|
9
|
-
|
10
|
-
@directory = directory #Directory to grab files from
|
8
|
+
def self.crawl(tags=[], directory="")
|
9
|
+
@content = Hash[tags.map {|k| [k, []]}] #Hash with content
|
10
|
+
@directory = directory #Current top-level directory
|
11
11
|
|
12
|
-
|
13
|
-
|
14
|
-
|
12
|
+
navigate_directory
|
13
|
+
Writer.write(@content)
|
14
|
+
end
|
15
15
|
|
16
16
|
private
|
17
17
|
|
18
|
-
def navigate_directory
|
18
|
+
def self.navigate_directory
|
19
19
|
#Set the cwd to the given dir send to gather all nested files from there
|
20
20
|
Dir.chdir(@directory)
|
21
|
-
gather_files
|
21
|
+
open_files(gather_files)
|
22
22
|
end
|
23
23
|
|
24
|
-
def gather_files
|
25
|
-
#Gathers all of the HTML or XML files from this and all subdirectories
|
24
|
+
def self.gather_files
|
25
|
+
#Gathers all of the HTML or XML files from this and all subdirectories into an array
|
26
|
+
file_array = []
|
27
|
+
Find.find(@directory) do |file|
|
28
|
+
file_array << file if file.match(/\.xml\Z/) || file.match(/\.html\Z/)
|
29
|
+
end
|
30
|
+
file_array
|
31
|
+
end
|
26
32
|
|
33
|
+
def self.open_files(file_array)
|
34
|
+
#Opens all the files in the file_array
|
35
|
+
file_array.each do |file|
|
36
|
+
open_next_file(file)
|
37
|
+
end
|
27
38
|
end
|
28
39
|
|
29
|
-
def open_next_file(file)
|
40
|
+
def self.open_next_file(file)
|
30
41
|
#Opens the next file on the list, depending on the extension passes it to HTML or XML
|
31
42
|
f = File.open(file)
|
32
43
|
|
@@ -37,44 +48,52 @@ module Wriggler
|
|
37
48
|
end
|
38
49
|
end
|
39
50
|
|
40
|
-
def is_HTML?(file)
|
51
|
+
def self.is_HTML?(file)
|
41
52
|
#Determines, using a regex check, if it is an HTML file
|
42
53
|
file =~ /.html/
|
43
54
|
end
|
44
55
|
|
45
|
-
def is_XML?(file)
|
56
|
+
def self.is_XML?(file)
|
46
57
|
#Determines, using a regex check, if it is an XML file
|
47
58
|
file =~ /.xml/
|
48
59
|
end
|
49
60
|
|
50
|
-
def set_HTML(file)
|
61
|
+
def self.set_HTML(file)
|
51
62
|
#Set the HTML file into Nokogiri for crawling
|
52
63
|
doc = Nokogiri::HTML(file)
|
53
64
|
crawl_file(doc)
|
54
65
|
end
|
55
66
|
|
56
|
-
def set_XML(file)
|
67
|
+
def self.set_XML(file)
|
57
68
|
#Set the XML file into Nokogiri for crawling
|
58
69
|
doc = Nokogiri::XML(file)
|
59
70
|
crawl_file(doc)
|
60
71
|
end
|
61
72
|
|
62
|
-
def crawl_file(doc)
|
73
|
+
def self.crawl_file(doc)
|
63
74
|
#Crawl the Nokogiri Object for the file
|
64
75
|
@content.each_key do |key|
|
76
|
+
arr = []
|
65
77
|
if !doc.xpath("//#{key}").empty? #Returns an empty array if tag is not present
|
66
|
-
doc.xpath("//#{key}").map{ |tag|
|
78
|
+
doc.xpath("//#{key}").map{ |tag| arr << sanitize(tag.text) }
|
67
79
|
end
|
80
|
+
fill_content(arr, key)
|
68
81
|
end
|
69
82
|
end
|
70
83
|
|
71
|
-
def sanitize(text)
|
84
|
+
def self.sanitize(text)
|
72
85
|
#Removes any escaped quotes, replaces them
|
73
|
-
text.gsub(/"/, "'")
|
86
|
+
text.gsub(/"/, "'").lstrip.chomp
|
87
|
+
end
|
88
|
+
|
89
|
+
def self.fill_content(arr, key)
|
90
|
+
#Doesn't shovel if there is no content found for the specific tag
|
91
|
+
!arr.empty? ? (@content.fetch(key) << arr) : nil
|
74
92
|
end
|
75
93
|
end
|
76
94
|
|
77
95
|
module Writer
|
78
|
-
def
|
96
|
+
def write(content)
|
97
|
+
@content = content
|
79
98
|
end
|
80
99
|
end
|
data/test.rb
CHANGED
@@ -1,91 +1,93 @@
|
|
1
1
|
require "nokogiri"
|
2
|
+
require "find"
|
2
3
|
|
3
4
|
class Wriggler
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
5
|
+
def initialize(tags=[], directory="", subdirectories=true)
|
6
|
+
@content = Hash[tags.map {|k| [k, []]}] #Hash with content
|
7
|
+
@directory = directory #Current top-level directory
|
8
|
+
@subdirectories = subdirectories #Default true for the existence of subdirs
|
8
9
|
|
9
|
-
|
10
|
-
|
11
|
-
|
10
|
+
navigate_directory
|
11
|
+
p @content
|
12
|
+
# Writer.write(@content)
|
13
|
+
end
|
12
14
|
|
13
15
|
private
|
14
16
|
|
15
17
|
def navigate_directory
|
16
|
-
|
17
|
-
|
18
|
-
|
18
|
+
#Set the cwd to the given dir send to gather all nested files from there
|
19
|
+
Dir.chdir(@directory)
|
20
|
+
open_files(gather_files)
|
19
21
|
end
|
20
22
|
|
21
23
|
def gather_files
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
puts ""
|
35
|
-
open_next_file("test3.xml")
|
36
|
-
puts "=============="
|
37
|
-
puts "3:"
|
38
|
-
p @content
|
39
|
-
puts "=============="
|
40
|
-
puts ""
|
24
|
+
#Gathers all of the HTML or XML files from this and all subdirectories into an array
|
25
|
+
file_array = []
|
26
|
+
Find.find(@directory) do |f|
|
27
|
+
file_array << f if f.match(/\.xml\Z/) || f.match(/\.html\Z/)
|
28
|
+
end
|
29
|
+
file_array
|
30
|
+
end
|
31
|
+
|
32
|
+
def open_files(file_array)
|
33
|
+
file_array.each do |file|
|
34
|
+
open_next_file(file)
|
35
|
+
end
|
41
36
|
end
|
42
37
|
|
43
38
|
def open_next_file(file)
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
39
|
+
#Opens the next file on the list, depending on the extension passes it to HTML or XML
|
40
|
+
f = File.open(file)
|
41
|
+
|
42
|
+
if is_HTML?(file)
|
43
|
+
set_HTML(f)
|
44
|
+
elsif is_XML?(file)
|
45
|
+
set_XML(f)
|
46
|
+
end
|
52
47
|
end
|
53
48
|
|
54
|
-
def
|
55
|
-
|
56
|
-
|
49
|
+
def is_HTML?(file)
|
50
|
+
#Determines, using a regex check, if it is an HTML file
|
51
|
+
file =~ /.html/
|
57
52
|
end
|
58
53
|
|
59
|
-
def
|
60
|
-
|
61
|
-
|
54
|
+
def is_XML?(file)
|
55
|
+
#Determines, using a regex check, if it is an XML file
|
56
|
+
file =~ /.xml/
|
62
57
|
end
|
63
58
|
|
64
59
|
def set_HTML(file)
|
65
|
-
|
66
|
-
|
67
|
-
|
60
|
+
#Set the HTML file into Nokogiri for crawling
|
61
|
+
doc = Nokogiri::HTML(file)
|
62
|
+
crawl_file(doc)
|
68
63
|
end
|
69
64
|
|
70
65
|
def set_XML(file)
|
71
|
-
|
72
|
-
|
73
|
-
|
66
|
+
#Set the XML file into Nokogiri for crawling
|
67
|
+
doc = Nokogiri::XML(file)
|
68
|
+
crawl_file(doc)
|
74
69
|
end
|
75
70
|
|
76
71
|
def crawl_file(doc)
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
72
|
+
#Crawl the Nokogiri Object for the file
|
73
|
+
@content.each_key do |key|
|
74
|
+
arr = []
|
75
|
+
if !doc.xpath("//#{key}").empty? #Returns an empty array if tag is not present
|
76
|
+
doc.xpath("//#{key}").map{ |tag| arr << sanitize(tag.text) }
|
77
|
+
end
|
78
|
+
fill_content(arr, key)
|
79
|
+
end
|
83
80
|
end
|
84
81
|
|
85
82
|
def sanitize(text)
|
86
|
-
|
87
|
-
|
83
|
+
#Removes any escaped quotes, replaces them
|
84
|
+
text.gsub(/"/, "'").lstrip.chomp
|
85
|
+
end
|
86
|
+
|
87
|
+
def fill_content(arr, key)
|
88
|
+
#Doesn't shovel if there is no content found for the specific tag
|
89
|
+
!arr.empty? ? (@content.fetch(key) << arr) : nil
|
88
90
|
end
|
89
91
|
end
|
90
92
|
|
91
|
-
test = Wriggler.new(["character", "content", "name", "title"], "/Users/47900/Desktop/Ruby/wriggler/dirtest", false)
|
93
|
+
test = Wriggler.new(["character", "content", "name", "title", "test"], "/Users/47900/Desktop/Ruby/wriggler/dirtest", false)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wriggler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Elliott Young
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-06-
|
11
|
+
date: 2015-06-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -85,9 +85,11 @@ files:
|
|
85
85
|
- Rakefile
|
86
86
|
- bin/console
|
87
87
|
- bin/setup
|
88
|
+
- dirtest/nested_fldr/test5.xml
|
88
89
|
- dirtest/test1.xml
|
89
90
|
- dirtest/test2.xml
|
90
91
|
- dirtest/test3.xml
|
92
|
+
- dirtest/test4.html
|
91
93
|
- lib/wriggler.rb
|
92
94
|
- lib/wriggler/version.rb
|
93
95
|
- test.rb
|