wriggler 0.1.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/dirtest/nested_fldr/test5.xml +1 -0
- data/dirtest/test1.xml +1 -0
- data/dirtest/test4.html +7 -0
- data/lib/wriggler/version.rb +1 -1
- data/lib/wriggler.rb +40 -21
- data/test.rb +60 -58
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1d741f12caa9d0cae037e2689e76ff0aa6adee13
|
4
|
+
data.tar.gz: d84a237b2f9c49ef76df0a1fa7db330d32d031ea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7fd896183312ea3a2f9ab5e00836dfaf12f7af1486796deedd2f80f1c3f851ae437d29d8c48566a8f89c8ff04148b51c797ad82a8518fd65fb025178f75c80c0
|
7
|
+
data.tar.gz: 0aa7b338518a6285e5bb3aac82ff401b0157a3b72f805b49f2b066851e1aa81f22a7f03924db2625ea50eece17c6492942424664e380fa2c50ab89d5dd0f0b8d
|
@@ -0,0 +1 @@
|
|
1
|
+
<test>If this appears it works</test>
|
data/dirtest/test1.xml
CHANGED
data/dirtest/test4.html
ADDED
data/lib/wriggler/version.rb
CHANGED
data/lib/wriggler.rb
CHANGED
@@ -1,32 +1,43 @@
|
|
1
1
|
require "wriggler/version"
|
2
2
|
require "nokogiri"
|
3
|
+
require "find"
|
3
4
|
|
4
5
|
module Wriggler
|
5
6
|
attr_reader :content, :directory
|
6
7
|
|
7
|
-
def crawl(tags=[], directory=""
|
8
|
-
|
9
|
-
|
10
|
-
@directory = directory #Directory to grab files from
|
8
|
+
def self.crawl(tags=[], directory="")
|
9
|
+
@content = Hash[tags.map {|k| [k, []]}] #Hash with content
|
10
|
+
@directory = directory #Current top-level directory
|
11
11
|
|
12
|
-
|
13
|
-
|
14
|
-
|
12
|
+
navigate_directory
|
13
|
+
Writer.write(@content)
|
14
|
+
end
|
15
15
|
|
16
16
|
private
|
17
17
|
|
18
|
-
def navigate_directory
|
18
|
+
def self.navigate_directory
|
19
19
|
#Set the cwd to the given dir send to gather all nested files from there
|
20
20
|
Dir.chdir(@directory)
|
21
|
-
gather_files
|
21
|
+
open_files(gather_files)
|
22
22
|
end
|
23
23
|
|
24
|
-
def gather_files
|
25
|
-
#Gathers all of the HTML or XML files from this and all subdirectories
|
24
|
+
def self.gather_files
|
25
|
+
#Gathers all of the HTML or XML files from this and all subdirectories into an array
|
26
|
+
file_array = []
|
27
|
+
Find.find(@directory) do |file|
|
28
|
+
file_array << file if file.match(/\.xml\Z/) || file.match(/\.html\Z/)
|
29
|
+
end
|
30
|
+
file_array
|
31
|
+
end
|
26
32
|
|
33
|
+
def self.open_files(file_array)
|
34
|
+
#Opens all the files in the file_array
|
35
|
+
file_array.each do |file|
|
36
|
+
open_next_file(file)
|
37
|
+
end
|
27
38
|
end
|
28
39
|
|
29
|
-
def open_next_file(file)
|
40
|
+
def self.open_next_file(file)
|
30
41
|
#Opens the next file on the list, depending on the extension passes it to HTML or XML
|
31
42
|
f = File.open(file)
|
32
43
|
|
@@ -37,44 +48,52 @@ module Wriggler
|
|
37
48
|
end
|
38
49
|
end
|
39
50
|
|
40
|
-
def is_HTML?(file)
|
51
|
+
def self.is_HTML?(file)
|
41
52
|
#Determines, using a regex check, if it is an HTML file
|
42
53
|
file =~ /.html/
|
43
54
|
end
|
44
55
|
|
45
|
-
def is_XML?(file)
|
56
|
+
def self.is_XML?(file)
|
46
57
|
#Determines, using a regex check, if it is an XML file
|
47
58
|
file =~ /.xml/
|
48
59
|
end
|
49
60
|
|
50
|
-
def set_HTML(file)
|
61
|
+
def self.set_HTML(file)
|
51
62
|
#Set the HTML file into Nokogiri for crawling
|
52
63
|
doc = Nokogiri::HTML(file)
|
53
64
|
crawl_file(doc)
|
54
65
|
end
|
55
66
|
|
56
|
-
def set_XML(file)
|
67
|
+
def self.set_XML(file)
|
57
68
|
#Set the XML file into Nokogiri for crawling
|
58
69
|
doc = Nokogiri::XML(file)
|
59
70
|
crawl_file(doc)
|
60
71
|
end
|
61
72
|
|
62
|
-
def crawl_file(doc)
|
73
|
+
def self.crawl_file(doc)
|
63
74
|
#Crawl the Nokogiri Object for the file
|
64
75
|
@content.each_key do |key|
|
76
|
+
arr = []
|
65
77
|
if !doc.xpath("//#{key}").empty? #Returns an empty array if tag is not present
|
66
|
-
doc.xpath("//#{key}").map{ |tag|
|
78
|
+
doc.xpath("//#{key}").map{ |tag| arr << sanitize(tag.text) }
|
67
79
|
end
|
80
|
+
fill_content(arr, key)
|
68
81
|
end
|
69
82
|
end
|
70
83
|
|
71
|
-
def sanitize(text)
|
84
|
+
def self.sanitize(text)
|
72
85
|
#Removes any escaped quotes, replaces them
|
73
|
-
text.gsub(/"/, "'")
|
86
|
+
text.gsub(/"/, "'").lstrip.chomp
|
87
|
+
end
|
88
|
+
|
89
|
+
def self.fill_content(arr, key)
|
90
|
+
#Doesn't shovel if there is no content found for the specific tag
|
91
|
+
!arr.empty? ? (@content.fetch(key) << arr) : nil
|
74
92
|
end
|
75
93
|
end
|
76
94
|
|
77
95
|
module Writer
|
78
|
-
def
|
96
|
+
def write(content)
|
97
|
+
@content = content
|
79
98
|
end
|
80
99
|
end
|
data/test.rb
CHANGED
@@ -1,91 +1,93 @@
|
|
1
1
|
require "nokogiri"
|
2
|
+
require "find"
|
2
3
|
|
3
4
|
class Wriggler
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
5
|
+
def initialize(tags=[], directory="", subdirectories=true)
|
6
|
+
@content = Hash[tags.map {|k| [k, []]}] #Hash with content
|
7
|
+
@directory = directory #Current top-level directory
|
8
|
+
@subdirectories = subdirectories #Default true for the existence of subdirs
|
8
9
|
|
9
|
-
|
10
|
-
|
11
|
-
|
10
|
+
navigate_directory
|
11
|
+
p @content
|
12
|
+
# Writer.write(@content)
|
13
|
+
end
|
12
14
|
|
13
15
|
private
|
14
16
|
|
15
17
|
def navigate_directory
|
16
|
-
|
17
|
-
|
18
|
-
|
18
|
+
#Set the cwd to the given dir send to gather all nested files from there
|
19
|
+
Dir.chdir(@directory)
|
20
|
+
open_files(gather_files)
|
19
21
|
end
|
20
22
|
|
21
23
|
def gather_files
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
puts ""
|
35
|
-
open_next_file("test3.xml")
|
36
|
-
puts "=============="
|
37
|
-
puts "3:"
|
38
|
-
p @content
|
39
|
-
puts "=============="
|
40
|
-
puts ""
|
24
|
+
#Gathers all of the HTML or XML files from this and all subdirectories into an array
|
25
|
+
file_array = []
|
26
|
+
Find.find(@directory) do |f|
|
27
|
+
file_array << f if f.match(/\.xml\Z/) || f.match(/\.html\Z/)
|
28
|
+
end
|
29
|
+
file_array
|
30
|
+
end
|
31
|
+
|
32
|
+
def open_files(file_array)
|
33
|
+
file_array.each do |file|
|
34
|
+
open_next_file(file)
|
35
|
+
end
|
41
36
|
end
|
42
37
|
|
43
38
|
def open_next_file(file)
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
39
|
+
#Opens the next file on the list, depending on the extension passes it to HTML or XML
|
40
|
+
f = File.open(file)
|
41
|
+
|
42
|
+
if is_HTML?(file)
|
43
|
+
set_HTML(f)
|
44
|
+
elsif is_XML?(file)
|
45
|
+
set_XML(f)
|
46
|
+
end
|
52
47
|
end
|
53
48
|
|
54
|
-
def
|
55
|
-
|
56
|
-
|
49
|
+
def is_HTML?(file)
|
50
|
+
#Determines, using a regex check, if it is an HTML file
|
51
|
+
file =~ /.html/
|
57
52
|
end
|
58
53
|
|
59
|
-
def
|
60
|
-
|
61
|
-
|
54
|
+
def is_XML?(file)
|
55
|
+
#Determines, using a regex check, if it is an XML file
|
56
|
+
file =~ /.xml/
|
62
57
|
end
|
63
58
|
|
64
59
|
def set_HTML(file)
|
65
|
-
|
66
|
-
|
67
|
-
|
60
|
+
#Set the HTML file into Nokogiri for crawling
|
61
|
+
doc = Nokogiri::HTML(file)
|
62
|
+
crawl_file(doc)
|
68
63
|
end
|
69
64
|
|
70
65
|
def set_XML(file)
|
71
|
-
|
72
|
-
|
73
|
-
|
66
|
+
#Set the XML file into Nokogiri for crawling
|
67
|
+
doc = Nokogiri::XML(file)
|
68
|
+
crawl_file(doc)
|
74
69
|
end
|
75
70
|
|
76
71
|
def crawl_file(doc)
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
72
|
+
#Crawl the Nokogiri Object for the file
|
73
|
+
@content.each_key do |key|
|
74
|
+
arr = []
|
75
|
+
if !doc.xpath("//#{key}").empty? #Returns an empty array if tag is not present
|
76
|
+
doc.xpath("//#{key}").map{ |tag| arr << sanitize(tag.text) }
|
77
|
+
end
|
78
|
+
fill_content(arr, key)
|
79
|
+
end
|
83
80
|
end
|
84
81
|
|
85
82
|
def sanitize(text)
|
86
|
-
|
87
|
-
|
83
|
+
#Removes any escaped quotes, replaces them
|
84
|
+
text.gsub(/"/, "'").lstrip.chomp
|
85
|
+
end
|
86
|
+
|
87
|
+
def fill_content(arr, key)
|
88
|
+
#Doesn't shovel if there is no content found for the specific tag
|
89
|
+
!arr.empty? ? (@content.fetch(key) << arr) : nil
|
88
90
|
end
|
89
91
|
end
|
90
92
|
|
91
|
-
test = Wriggler.new(["character", "content", "name", "title"], "/Users/47900/Desktop/Ruby/wriggler/dirtest", false)
|
93
|
+
test = Wriggler.new(["character", "content", "name", "title", "test"], "/Users/47900/Desktop/Ruby/wriggler/dirtest", false)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wriggler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Elliott Young
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-06-
|
11
|
+
date: 2015-06-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -85,9 +85,11 @@ files:
|
|
85
85
|
- Rakefile
|
86
86
|
- bin/console
|
87
87
|
- bin/setup
|
88
|
+
- dirtest/nested_fldr/test5.xml
|
88
89
|
- dirtest/test1.xml
|
89
90
|
- dirtest/test2.xml
|
90
91
|
- dirtest/test3.xml
|
92
|
+
- dirtest/test4.html
|
91
93
|
- lib/wriggler.rb
|
92
94
|
- lib/wriggler/version.rb
|
93
95
|
- test.rb
|