wriggler 0.5.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +8 -8
- data/lib/wriggler/version.rb +1 -1
- data/lib/wriggler.rb +12 -2
- metadata +1 -7
- data/dirtest/nested_fldr/test5.xml +0 -1
- data/dirtest/test1.xml +0 -31
- data/dirtest/test2.xml +0 -30
- data/dirtest/test3.xml +0 -30
- data/dirtest/test4.html +0 -7
- data/test.rb +0 -93
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b1d083c4ddf26508f6ca9b68f5574bf1491eba46
|
4
|
+
data.tar.gz: 2f347c815b35996afbec369bc0f39eb56cb2c739
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b3fab4d15b405776477b04317e34f757971ab4f6c14cb1041ba2040e513bbcb8a0cc837427372843e77365e322d82ae72f44aa54fbd0c724fa5e224c276b48b9
|
7
|
+
data.tar.gz: 8d8a1aace53b38470d54f0dade3708e0645211aaaa25d6e09f78175bccb93c9cf24af2e44735e3311b929c98ff0a2cf34a9194b26342b063579921a4c896bc99
|
data/README.md
CHANGED
@@ -1,8 +1,6 @@
|
|
1
1
|
# Wriggler
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
Wriggler was created to serve and the crawler for a search engine, moving its way through HTML files and grabbing data based on pre determined tags then making/storing the data in a number of specifically created CSV files.
|
3
|
+
Wriggler was created to serve as the crawler for a search engine, moving its way through HTML and/or XML files and grabbing data based on predetermined tags, then storing the data in a specifically created CSV file. Wriggler acts similarly to a spider, but was designed to be used with any number of local files, not as an actual web scraper.
|
6
4
|
|
7
5
|
## Installation
|
8
6
|
|
@@ -22,18 +20,20 @@ Or install it yourself as:
|
|
22
20
|
|
23
21
|
## Usage
|
24
22
|
|
25
|
-
|
26
|
-
|
27
|
-
## Development
|
23
|
+
You only need to run one command to use Wriggler:
|
28
24
|
|
29
|
-
|
25
|
+
```ruby
|
26
|
+
Wriggler.crawl([array, of, HTML/XML, tags], directory)
|
27
|
+
```
|
30
28
|
|
31
|
-
|
29
|
+
Note: The directory argument should be the top-level directory that contains your HTML/XML files. Wriggler will account for any nested directories within this directory that also contain HTML/XML files and, when it finishes running, will save a new file named "tag_content.csv" to this directory.
|
32
30
|
|
33
31
|
## Contributing
|
34
32
|
|
35
33
|
Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/wriggler. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
36
34
|
|
35
|
+
On top of that, please contribute. I built this for a very specific reason, but I would very much like to see it become something bigger, so if you can assist with that please do!
|
36
|
+
|
37
37
|
|
38
38
|
## License
|
39
39
|
|
data/lib/wriggler/version.rb
CHANGED
data/lib/wriggler.rb
CHANGED
@@ -92,8 +92,18 @@ module Wriggler
|
|
92
92
|
end
|
93
93
|
end
|
94
94
|
|
95
|
+
require 'CSV'
|
96
|
+
|
95
97
|
module Writer
|
96
|
-
def write(content)
|
97
|
-
|
98
|
+
def self.write(content)
|
99
|
+
#Write to a CSV file now
|
100
|
+
column_names = content.keys
|
101
|
+
s = CSV.generate do |csv|
|
102
|
+
csv << column_names
|
103
|
+
content.keys.each do |key|
|
104
|
+
csv << content.fetch(key)
|
105
|
+
end
|
106
|
+
end
|
107
|
+
File.write('tag_content.csv', s)
|
98
108
|
end
|
99
109
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wriggler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Elliott Young
|
@@ -85,14 +85,8 @@ files:
|
|
85
85
|
- Rakefile
|
86
86
|
- bin/console
|
87
87
|
- bin/setup
|
88
|
-
- dirtest/nested_fldr/test5.xml
|
89
|
-
- dirtest/test1.xml
|
90
|
-
- dirtest/test2.xml
|
91
|
-
- dirtest/test3.xml
|
92
|
-
- dirtest/test4.html
|
93
88
|
- lib/wriggler.rb
|
94
89
|
- lib/wriggler/version.rb
|
95
|
-
- test.rb
|
96
90
|
- wriggler.gemspec
|
97
91
|
homepage: https://github.com/ElliottAYoung/wriggler
|
98
92
|
licenses:
|
@@ -1 +0,0 @@
|
|
1
|
-
<test>If this appears it works</test>
|
data/dirtest/test1.xml
DELETED
@@ -1,31 +0,0 @@
|
|
1
|
-
<root>
|
2
|
-
<sitcoms>
|
3
|
-
<sitcom>
|
4
|
-
<test>This is different</test>
|
5
|
-
<name>Married with Children</name>
|
6
|
-
<characters>
|
7
|
-
<character>Al Bundy</character>
|
8
|
-
<character>Bud Bundy</character>
|
9
|
-
<character>Marcy Darcy</character>
|
10
|
-
</characters>
|
11
|
-
</sitcom>
|
12
|
-
<sitcom>
|
13
|
-
<name>Perfect Strangers</name>
|
14
|
-
<characters>
|
15
|
-
<character>Larry Appleton</character>
|
16
|
-
<character>Balki Bartokomous</character>
|
17
|
-
</characters>
|
18
|
-
</sitcom>
|
19
|
-
</sitcoms>
|
20
|
-
<dramas>
|
21
|
-
<drama>
|
22
|
-
<name>The A-Team</name>
|
23
|
-
<characters>
|
24
|
-
<character>John "Hannibal" Smith</character>
|
25
|
-
<character>Templeton "Face" Peck</character>
|
26
|
-
<character>"B.A." Baracus</character>
|
27
|
-
<character>"Howling Mad" Murdock</character>
|
28
|
-
</characters>
|
29
|
-
</drama>
|
30
|
-
</dramas>
|
31
|
-
</root>
|
data/dirtest/test2.xml
DELETED
@@ -1,30 +0,0 @@
|
|
1
|
-
<root>
|
2
|
-
<sitcoms>
|
3
|
-
<sitcom>
|
4
|
-
<name>Married with Children</name>
|
5
|
-
<characters>
|
6
|
-
<character>Al Bundy</character>
|
7
|
-
<character>Bud Bundy</character>
|
8
|
-
<character>Marcy Darcy</character>
|
9
|
-
</characters>
|
10
|
-
</sitcom>
|
11
|
-
<sitcom>
|
12
|
-
<name>Perfect Strangers</name>
|
13
|
-
<characters>
|
14
|
-
<character>Larry Appleton</character>
|
15
|
-
<character>Balki Bartokomous</character>
|
16
|
-
</characters>
|
17
|
-
</sitcom>
|
18
|
-
</sitcoms>
|
19
|
-
<dramas>
|
20
|
-
<drama>
|
21
|
-
<name>The A-Team</name>
|
22
|
-
<characters>
|
23
|
-
<character>John "Hannibal" Smith</character>
|
24
|
-
<character>Templeton "Face" Peck</character>
|
25
|
-
<character>"B.A." Baracus</character>
|
26
|
-
<character>"Howling Mad" Murdock</character>
|
27
|
-
</characters>
|
28
|
-
</drama>
|
29
|
-
</dramas>
|
30
|
-
</root>
|
data/dirtest/test3.xml
DELETED
@@ -1,30 +0,0 @@
|
|
1
|
-
<root>
|
2
|
-
<sitcoms>
|
3
|
-
<sitcom>
|
4
|
-
<name>Married with Children</name>
|
5
|
-
<characters>
|
6
|
-
<character>Al Bundy</character>
|
7
|
-
<character>Bud Bundy</character>
|
8
|
-
<character>Marcy Darcy</character>
|
9
|
-
</characters>
|
10
|
-
</sitcom>
|
11
|
-
<sitcom>
|
12
|
-
<name>Perfect Strangers</name>
|
13
|
-
<characters>
|
14
|
-
<character>Larry Appleton</character>
|
15
|
-
<character>Balki Bartokomous</character>
|
16
|
-
</characters>
|
17
|
-
</sitcom>
|
18
|
-
</sitcoms>
|
19
|
-
<dramas>
|
20
|
-
<drama>
|
21
|
-
<name>The A-Team</name>
|
22
|
-
<characters>
|
23
|
-
<character>John "Hannibal" Smith</character>
|
24
|
-
<character>Templeton "Face" Peck</character>
|
25
|
-
<character>"B.A." Baracus</character>
|
26
|
-
<character>"Howling Mad" Murdock</character>
|
27
|
-
</characters>
|
28
|
-
</drama>
|
29
|
-
</dramas>
|
30
|
-
</root>
|
data/dirtest/test4.html
DELETED
data/test.rb
DELETED
@@ -1,93 +0,0 @@
|
|
1
|
-
require "nokogiri"
|
2
|
-
require "find"
|
3
|
-
|
4
|
-
class Wriggler
|
5
|
-
def initialize(tags=[], directory="", subdirectories=true)
|
6
|
-
@content = Hash[tags.map {|k| [k, []]}] #Hash with content
|
7
|
-
@directory = directory #Current top-level directory
|
8
|
-
@subdirectories = subdirectories #Default true for the existence of subdirs
|
9
|
-
|
10
|
-
navigate_directory
|
11
|
-
p @content
|
12
|
-
# Writer.write(@content)
|
13
|
-
end
|
14
|
-
|
15
|
-
private
|
16
|
-
|
17
|
-
def navigate_directory
|
18
|
-
#Set the cwd to the given dir send to gather all nested files from there
|
19
|
-
Dir.chdir(@directory)
|
20
|
-
open_files(gather_files)
|
21
|
-
end
|
22
|
-
|
23
|
-
def gather_files
|
24
|
-
#Gathers all of the HTML or XML files from this and all subdirectories into an array
|
25
|
-
file_array = []
|
26
|
-
Find.find(@directory) do |f|
|
27
|
-
file_array << f if f.match(/\.xml\Z/) || f.match(/\.html\Z/)
|
28
|
-
end
|
29
|
-
file_array
|
30
|
-
end
|
31
|
-
|
32
|
-
def open_files(file_array)
|
33
|
-
file_array.each do |file|
|
34
|
-
open_next_file(file)
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
def open_next_file(file)
|
39
|
-
#Opens the next file on the list, depending on the extension passes it to HTML or XML
|
40
|
-
f = File.open(file)
|
41
|
-
|
42
|
-
if is_HTML?(file)
|
43
|
-
set_HTML(f)
|
44
|
-
elsif is_XML?(file)
|
45
|
-
set_XML(f)
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
def is_HTML?(file)
|
50
|
-
#Determines, using a regex check, if it is an HTML file
|
51
|
-
file =~ /.html/
|
52
|
-
end
|
53
|
-
|
54
|
-
def is_XML?(file)
|
55
|
-
#Determines, using a regex check, if it is an XML file
|
56
|
-
file =~ /.xml/
|
57
|
-
end
|
58
|
-
|
59
|
-
def set_HTML(file)
|
60
|
-
#Set the HTML file into Nokogiri for crawling
|
61
|
-
doc = Nokogiri::HTML(file)
|
62
|
-
crawl_file(doc)
|
63
|
-
end
|
64
|
-
|
65
|
-
def set_XML(file)
|
66
|
-
#Set the XML file into Nokogiri for crawling
|
67
|
-
doc = Nokogiri::XML(file)
|
68
|
-
crawl_file(doc)
|
69
|
-
end
|
70
|
-
|
71
|
-
def crawl_file(doc)
|
72
|
-
#Crawl the Nokogiri Object for the file
|
73
|
-
@content.each_key do |key|
|
74
|
-
arr = []
|
75
|
-
if !doc.xpath("//#{key}").empty? #Returns an empty array if tag is not present
|
76
|
-
doc.xpath("//#{key}").map{ |tag| arr << sanitize(tag.text) }
|
77
|
-
end
|
78
|
-
fill_content(arr, key)
|
79
|
-
end
|
80
|
-
end
|
81
|
-
|
82
|
-
def sanitize(text)
|
83
|
-
#Removes any escaped quotes, replaces them
|
84
|
-
text.gsub(/"/, "'").lstrip.chomp
|
85
|
-
end
|
86
|
-
|
87
|
-
def fill_content(arr, key)
|
88
|
-
#Doesn't shovel if there is no content found for the specific tag
|
89
|
-
!arr.empty? ? (@content.fetch(key) << arr) : nil
|
90
|
-
end
|
91
|
-
end
|
92
|
-
|
93
|
-
test = Wriggler.new(["character", "content", "name", "title", "test"], "/Users/47900/Desktop/Ruby/wriggler/dirtest", false)
|