wriggler 0.5.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +8 -8
- data/lib/wriggler/version.rb +1 -1
- data/lib/wriggler.rb +12 -2
- metadata +1 -7
- data/dirtest/nested_fldr/test5.xml +0 -1
- data/dirtest/test1.xml +0 -31
- data/dirtest/test2.xml +0 -30
- data/dirtest/test3.xml +0 -30
- data/dirtest/test4.html +0 -7
- data/test.rb +0 -93
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b1d083c4ddf26508f6ca9b68f5574bf1491eba46
|
4
|
+
data.tar.gz: 2f347c815b35996afbec369bc0f39eb56cb2c739
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b3fab4d15b405776477b04317e34f757971ab4f6c14cb1041ba2040e513bbcb8a0cc837427372843e77365e322d82ae72f44aa54fbd0c724fa5e224c276b48b9
|
7
|
+
data.tar.gz: 8d8a1aace53b38470d54f0dade3708e0645211aaaa25d6e09f78175bccb93c9cf24af2e44735e3311b929c98ff0a2cf34a9194b26342b063579921a4c896bc99
|
data/README.md
CHANGED
@@ -1,8 +1,6 @@
|
|
1
1
|
# Wriggler
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
Wriggler was created to serve and the crawler for a search engine, moving its way through HTML files and grabbing data based on pre determined tags then making/storing the data in a number of specifically created CSV files.
|
3
|
+
Wriggler was created to serve as the crawler for a search engine, moving its way through HTML and/or XML files and grabbing data based on predetermined tags, then storing the data in a specifically created CSV file. Wriggler acts similarly to a spider, but was designed to be used with any number of local files, not as an actual web scraper.
|
6
4
|
|
7
5
|
## Installation
|
8
6
|
|
@@ -22,18 +20,20 @@ Or install it yourself as:
|
|
22
20
|
|
23
21
|
## Usage
|
24
22
|
|
25
|
-
|
26
|
-
|
27
|
-
## Development
|
23
|
+
You only need to run one command to use Wriggler:
|
28
24
|
|
29
|
-
|
25
|
+
```ruby
|
26
|
+
Wriggler.crawl([array, of, HTML/XML, tags], directory)
|
27
|
+
```
|
30
28
|
|
31
|
-
|
29
|
+
Note: The directory should be the top-level directory that contains your HTML/XML files. Wriggler will account for any nested directories within it that also contain HTML/XML files and, when it finishes running, will save a new file named "tag_content.csv" to this directory.
|
32
30
|
|
33
31
|
## Contributing
|
34
32
|
|
35
33
|
Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/wriggler. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
36
34
|
|
35
|
+
On top of that, please contribute. I built this for a very specific reason, but I would very much like to see it become something bigger, so if you can assist with that please do!
|
36
|
+
|
37
37
|
|
38
38
|
## License
|
39
39
|
|
data/lib/wriggler/version.rb
CHANGED
data/lib/wriggler.rb
CHANGED
@@ -92,8 +92,18 @@ module Wriggler
|
|
92
92
|
end
|
93
93
|
end
|
94
94
|
|
95
|
+
require 'CSV'
|
96
|
+
|
95
97
|
module Writer
|
96
|
-
def write(content)
|
97
|
-
|
98
|
+
def self.write(content)
|
99
|
+
#Write to a CSV file now
|
100
|
+
column_names = content.keys
|
101
|
+
s = CSV.generate do |csv|
|
102
|
+
csv << column_names
|
103
|
+
content.keys.each do |key|
|
104
|
+
csv << content.fetch(key)
|
105
|
+
end
|
106
|
+
end
|
107
|
+
File.write('tag_content.csv', s)
|
98
108
|
end
|
99
109
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wriggler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Elliott Young
|
@@ -85,14 +85,8 @@ files:
|
|
85
85
|
- Rakefile
|
86
86
|
- bin/console
|
87
87
|
- bin/setup
|
88
|
-
- dirtest/nested_fldr/test5.xml
|
89
|
-
- dirtest/test1.xml
|
90
|
-
- dirtest/test2.xml
|
91
|
-
- dirtest/test3.xml
|
92
|
-
- dirtest/test4.html
|
93
88
|
- lib/wriggler.rb
|
94
89
|
- lib/wriggler/version.rb
|
95
|
-
- test.rb
|
96
90
|
- wriggler.gemspec
|
97
91
|
homepage: https://github.com/ElliottAYoung/wriggler
|
98
92
|
licenses:
|
@@ -1 +0,0 @@
|
|
1
|
-
<test>If this appears it works</test>
|
data/dirtest/test1.xml
DELETED
@@ -1,31 +0,0 @@
|
|
1
|
-
<root>
|
2
|
-
<sitcoms>
|
3
|
-
<sitcom>
|
4
|
-
<test>This is different</test>
|
5
|
-
<name>Married with Children</name>
|
6
|
-
<characters>
|
7
|
-
<character>Al Bundy</character>
|
8
|
-
<character>Bud Bundy</character>
|
9
|
-
<character>Marcy Darcy</character>
|
10
|
-
</characters>
|
11
|
-
</sitcom>
|
12
|
-
<sitcom>
|
13
|
-
<name>Perfect Strangers</name>
|
14
|
-
<characters>
|
15
|
-
<character>Larry Appleton</character>
|
16
|
-
<character>Balki Bartokomous</character>
|
17
|
-
</characters>
|
18
|
-
</sitcom>
|
19
|
-
</sitcoms>
|
20
|
-
<dramas>
|
21
|
-
<drama>
|
22
|
-
<name>The A-Team</name>
|
23
|
-
<characters>
|
24
|
-
<character>John "Hannibal" Smith</character>
|
25
|
-
<character>Templeton "Face" Peck</character>
|
26
|
-
<character>"B.A." Baracus</character>
|
27
|
-
<character>"Howling Mad" Murdock</character>
|
28
|
-
</characters>
|
29
|
-
</drama>
|
30
|
-
</dramas>
|
31
|
-
</root>
|
data/dirtest/test2.xml
DELETED
@@ -1,30 +0,0 @@
|
|
1
|
-
<root>
|
2
|
-
<sitcoms>
|
3
|
-
<sitcom>
|
4
|
-
<name>Married with Children</name>
|
5
|
-
<characters>
|
6
|
-
<character>Al Bundy</character>
|
7
|
-
<character>Bud Bundy</character>
|
8
|
-
<character>Marcy Darcy</character>
|
9
|
-
</characters>
|
10
|
-
</sitcom>
|
11
|
-
<sitcom>
|
12
|
-
<name>Perfect Strangers</name>
|
13
|
-
<characters>
|
14
|
-
<character>Larry Appleton</character>
|
15
|
-
<character>Balki Bartokomous</character>
|
16
|
-
</characters>
|
17
|
-
</sitcom>
|
18
|
-
</sitcoms>
|
19
|
-
<dramas>
|
20
|
-
<drama>
|
21
|
-
<name>The A-Team</name>
|
22
|
-
<characters>
|
23
|
-
<character>John "Hannibal" Smith</character>
|
24
|
-
<character>Templeton "Face" Peck</character>
|
25
|
-
<character>"B.A." Baracus</character>
|
26
|
-
<character>"Howling Mad" Murdock</character>
|
27
|
-
</characters>
|
28
|
-
</drama>
|
29
|
-
</dramas>
|
30
|
-
</root>
|
data/dirtest/test3.xml
DELETED
@@ -1,30 +0,0 @@
|
|
1
|
-
<root>
|
2
|
-
<sitcoms>
|
3
|
-
<sitcom>
|
4
|
-
<name>Married with Children</name>
|
5
|
-
<characters>
|
6
|
-
<character>Al Bundy</character>
|
7
|
-
<character>Bud Bundy</character>
|
8
|
-
<character>Marcy Darcy</character>
|
9
|
-
</characters>
|
10
|
-
</sitcom>
|
11
|
-
<sitcom>
|
12
|
-
<name>Perfect Strangers</name>
|
13
|
-
<characters>
|
14
|
-
<character>Larry Appleton</character>
|
15
|
-
<character>Balki Bartokomous</character>
|
16
|
-
</characters>
|
17
|
-
</sitcom>
|
18
|
-
</sitcoms>
|
19
|
-
<dramas>
|
20
|
-
<drama>
|
21
|
-
<name>The A-Team</name>
|
22
|
-
<characters>
|
23
|
-
<character>John "Hannibal" Smith</character>
|
24
|
-
<character>Templeton "Face" Peck</character>
|
25
|
-
<character>"B.A." Baracus</character>
|
26
|
-
<character>"Howling Mad" Murdock</character>
|
27
|
-
</characters>
|
28
|
-
</drama>
|
29
|
-
</dramas>
|
30
|
-
</root>
|
data/dirtest/test4.html
DELETED
data/test.rb
DELETED
@@ -1,93 +0,0 @@
|
|
1
|
-
require "nokogiri"
|
2
|
-
require "find"
|
3
|
-
|
4
|
-
class Wriggler
|
5
|
-
def initialize(tags=[], directory="", subdirectories=true)
|
6
|
-
@content = Hash[tags.map {|k| [k, []]}] #Hash with content
|
7
|
-
@directory = directory #Current top-level directory
|
8
|
-
@subdirectories = subdirectories #Default true for the existence of subdirs
|
9
|
-
|
10
|
-
navigate_directory
|
11
|
-
p @content
|
12
|
-
# Writer.write(@content)
|
13
|
-
end
|
14
|
-
|
15
|
-
private
|
16
|
-
|
17
|
-
def navigate_directory
|
18
|
-
#Set the cwd to the given dir send to gather all nested files from there
|
19
|
-
Dir.chdir(@directory)
|
20
|
-
open_files(gather_files)
|
21
|
-
end
|
22
|
-
|
23
|
-
def gather_files
|
24
|
-
#Gathers all of the HTML or XML files from this and all subdirectories into an array
|
25
|
-
file_array = []
|
26
|
-
Find.find(@directory) do |f|
|
27
|
-
file_array << f if f.match(/\.xml\Z/) || f.match(/\.html\Z/)
|
28
|
-
end
|
29
|
-
file_array
|
30
|
-
end
|
31
|
-
|
32
|
-
def open_files(file_array)
|
33
|
-
file_array.each do |file|
|
34
|
-
open_next_file(file)
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
def open_next_file(file)
|
39
|
-
#Opens the next file on the list, depending on the extension passes it to HTML or XML
|
40
|
-
f = File.open(file)
|
41
|
-
|
42
|
-
if is_HTML?(file)
|
43
|
-
set_HTML(f)
|
44
|
-
elsif is_XML?(file)
|
45
|
-
set_XML(f)
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
def is_HTML?(file)
|
50
|
-
#Determines, using a regex check, if it is an HTML file
|
51
|
-
file =~ /.html/
|
52
|
-
end
|
53
|
-
|
54
|
-
def is_XML?(file)
|
55
|
-
#Determines, using a regex check, if it is an XML file
|
56
|
-
file =~ /.xml/
|
57
|
-
end
|
58
|
-
|
59
|
-
def set_HTML(file)
|
60
|
-
#Set the HTML file into Nokogiri for crawling
|
61
|
-
doc = Nokogiri::HTML(file)
|
62
|
-
crawl_file(doc)
|
63
|
-
end
|
64
|
-
|
65
|
-
def set_XML(file)
|
66
|
-
#Set the XML file into Nokogiri for crawling
|
67
|
-
doc = Nokogiri::XML(file)
|
68
|
-
crawl_file(doc)
|
69
|
-
end
|
70
|
-
|
71
|
-
def crawl_file(doc)
|
72
|
-
#Crawl the Nokogiri Object for the file
|
73
|
-
@content.each_key do |key|
|
74
|
-
arr = []
|
75
|
-
if !doc.xpath("//#{key}").empty? #Returns an empty array if tag is not present
|
76
|
-
doc.xpath("//#{key}").map{ |tag| arr << sanitize(tag.text) }
|
77
|
-
end
|
78
|
-
fill_content(arr, key)
|
79
|
-
end
|
80
|
-
end
|
81
|
-
|
82
|
-
def sanitize(text)
|
83
|
-
#Removes any escaped quotes, replaces them
|
84
|
-
text.gsub(/"/, "'").lstrip.chomp
|
85
|
-
end
|
86
|
-
|
87
|
-
def fill_content(arr, key)
|
88
|
-
#Doesn't shovel if there is no content found for the specific tag
|
89
|
-
!arr.empty? ? (@content.fetch(key) << arr) : nil
|
90
|
-
end
|
91
|
-
end
|
92
|
-
|
93
|
-
test = Wriggler.new(["character", "content", "name", "title", "test"], "/Users/47900/Desktop/Ruby/wriggler/dirtest", false)
|