wordlist 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,33 +6,205 @@ module Wordlist
6
6
  module Builders
7
7
  class Website < Builder
8
8
 
9
+ # Proxy to use
10
+ attr_accessor :proxy
11
+
12
+ # User-Agent to use
13
+ attr_accessor :user_agent
14
+
15
+ # Referer URL to use
16
+ attr_accessor :referer
17
+
9
18
  # Host to spider
10
19
  attr_accessor :host
11
20
 
21
+ # HTTP Host Header to use in all requests.
22
+ attr_accessor :host_header
23
+
24
+ # Additional hosts that can be spidered
25
+ attr_reader :hosts
26
+
27
+ # Links to ignore while spidering
28
+ attr_reader :ignore_links
29
+
30
+ # Specifies whether the `content` attribute of `meta` tags will be
31
+ # parsed
32
+ attr_accessor :parse_meta
33
+
34
+ # Specifies whether `title` tags will be parsed
35
+ attr_accessor :parse_title
36
+
37
+ # Specifies whether `h1` tags will be parsed
38
+ attr_accessor :parse_h1
39
+
40
+ # Specifies whether `h2` tags will be parsed
41
+ attr_accessor :parse_h2
42
+
43
+ # Specifies whether `h3` tags will be parsed
44
+ attr_accessor :parse_h3
45
+
46
+ # Specifies whether `h4` tags will be parsed
47
+ attr_accessor :parse_h4
48
+
49
+ # Specifies whether `h5` tags will be parsed
50
+ attr_accessor :parse_h5
51
+
52
+ # Specifies whether `p` tags will be parsed
53
+ attr_accessor :parse_p
54
+
55
+ # Specifies whether `span` tags will be parsed
56
+ attr_accessor :parse_span
57
+
58
+ # Specifies whether the `alt` attributes of `img` tags will be parsed
59
+ attr_accessor :parse_alt
60
+
61
+ # Specifies whether HTML comment tags will be parsed
62
+ attr_accessor :parse_comments
63
+
64
+ # Additional XPath expressions to use to parse spidered pages
65
+ attr_reader :xpaths
66
+
67
+ #
68
+ # Creates a new Website builder object.
69
+ #
70
+ # @param [String] path
71
+ # The path to the word-list to build.
72
+ #
73
+ # @param [Hash] options
74
+ # Additional options.
75
+ #
76
+ # @option options [Hash] :proxy
77
+ # The Hash of proxy information to use.
78
+ #
79
+ # @option options [String] :user_agent
80
+ # The User-Agent string to send with each request.
81
+ #
82
+ # @option options [String] :referer
83
+ # The Referer URL to send with each request.
12
84
  #
13
- # Creates a new Website builder object with the specified _path_
14
- # and _host_. If a _block_ is given, it will be passed the new created
15
- # Website builder object.
85
+ # @option options [String] :host_header
86
+ # The HTTP Host header to use in all requests.
16
87
  #
17
- def initialize(path,host,&block)
18
- @host = host
88
+ # @option options [Array<String, Regexp, Proc>] :ignore_links
89
+ # Links to ignore while spidering.
90
+ #
91
+ # @option options [Boolean] :parse_meta (true)
92
+ # Specifies whether the `content` attribute of `meta` tags will be
93
+ # parsed.
94
+ #
95
+ # @option options [Boolean] :parse_title (true)
96
+ # Specifies whether `title` tags will be parsed.
97
+ #
98
+ # @option options [Boolean] :parse_h1 (true)
99
+ # Specifies whether `h1` tags will be parsed.
100
+ #
101
+ # @option options [Boolean] :parse_h2 (true)
102
+ # Specifies whether `h2` tags will be parsed.
103
+ #
104
+ # @option options [Boolean] :parse_h3 (true)
105
+ # Specifies whether `h3` tags will be parsed.
106
+ #
107
+ # @option options [Boolean] :parse_h4 (true)
108
+ # Specifies whether `h4` tags will be parsed.
109
+ #
110
+ # @option options [Boolean] :parse_h5 (true)
111
+ # Specifies whether `h5` tags will be parsed.
112
+ #
113
+ # @option options [Boolean] :parse_p (true)
114
+ # Specifies whether `p` tags will be parsed.
115
+ #
116
+ # @option options [Boolean] :parse_span (true)
117
+ # Specifies whether `span` tags will be parsed.
118
+ #
119
+ # @option options [Boolean] :parse_alt (true)
120
+ # Specifies whether the `alt` attributes of `img` tags will be
121
+ # parsed.
122
+ #
123
+ # @option options [Boolean] :parse_comments (false)
124
+ # Specifies whether HTML comment tags will be parsed.
125
+ #
126
+ # @option options [Array<String>] :xpaths
127
+ # Additional list of XPath expressions, to use when parsing
128
+ # spidered pages.
129
+ #
130
+ def initialize(path,options={},&block)
131
+ @proxy = options.fetch(:proxy,Spidr.proxy)
132
+ @user_agent = options[:user_agent]
133
+ @referer = options[:referer]
134
+
135
+ @host = options[:host]
136
+ @host_header = options[:host_header]
137
+ @hosts = Array(options[:hosts])
138
+
139
+ @ignore_links = Array(options[:ignore_links])
19
140
 
20
- super(path,&block)
141
+ @parse_meta = options.fetch(:parse_meta,true)
142
+ @parse_title = options.fetch(:parse_title,true)
143
+ @parse_h1 = options.fetch(:parse_h1,true)
144
+ @parse_h2 = options.fetch(:parse_h2,true)
145
+ @parse_h3 = options.fetch(:parse_h3,true)
146
+ @parse_h4 = options.fetch(:parse_h4,true)
147
+ @parse_h5 = options.fetch(:parse_h5,true)
148
+ @parse_p = options.fetch(:parse_p,true)
149
+ @parse_span = options.fetch(:parse_span,true)
150
+ @parse_alt = options.fetch(:parse_alt,true)
151
+ @parse_comments = options.fetch(:parse_comments,false)
152
+
153
+ @xpaths = Array(options[:xpaths])
154
+
155
+ super(path,options,&block)
21
156
  end
22
157
 
23
158
  #
24
- # Builds the word-list file by spidering the +host+ and parsing the
25
- # inner-text from all HTML pages. If a _block_ is given, it will be
26
- # called before all HTML pages on the +host+ have been parsed.
159
+ # Builds the word-list file by spidering the `host` and parsing the
160
+ # inner-text from all HTML pages.
161
+ #
162
+ # @yield [builder]
163
+ # If a block is given, it will be called before all HTML pages on
164
+ # the `host` have been parsed.
165
+ #
166
+ # @yieldparam [Website] builder
167
+ # The website word-list builder.
27
168
  #
28
169
  def build!(&block)
29
170
  super(&block)
30
171
 
31
- Spidr.host(@host) do |spidr|
172
+ options = {
173
+ :proxy => @proxy,
174
+ :user_agent => @user_agent,
175
+ :referer => @referer,
176
+ :hosts => @hosts,
177
+ :ignore_links => @ignore_links
178
+ }
179
+
180
+ xpaths = []
181
+ xpaths << '//meta/@content' if @parse_meta
182
+ xpaths << '//title' if @parse_title
183
+ xpaths << '//h1' if @parse_h1
184
+ xpaths << '//h2' if @parse_h2
185
+ xpaths << '//h3' if @parse_h3
186
+ xpaths << '//h4' if @parse_h4
187
+ xpaths << '//h5' if @parse_h5
188
+ xpaths << '//p' if @parse_p
189
+ xpaths << '//span' if @parse_span
190
+ xpaths << '//img/@alt' if @parse_alt
191
+ xpaths += @xpaths
192
+
193
+ Spidr.host(@host,options) do |spidr|
32
194
  spidr.every_page do |page|
33
195
  if page.html?
34
- page.doc.search('//h1|//h2|//h3|//h4|//h5|//p|//span').each do |element|
35
- parse(element.inner_text)
196
+ if page.doc
197
+ xpaths.each do |xpath|
198
+ page.doc.search(xpath).each do |element|
199
+ parse(element.inner_text)
200
+ end
201
+ end
202
+ end
203
+
204
+ if (@parse_comments && page.doc)
205
+ page.doc.traverse do |element|
206
+ parse(element.inner_text) if element.comment?
207
+ end
36
208
  end
37
209
  end
38
210
  end
@@ -7,8 +7,13 @@ module Wordlist
7
7
  attr_accessor :path
8
8
 
9
9
  #
10
- # Creates a new FlatFile list with the specified _path_ and given
11
- # _options_.
10
+ # Opens a new FlatFile list.
11
+ #
12
+ # @param [String] path
13
+ # The path to the flat file word-list to read from.
14
+ #
15
+ # @param [Hash] options
16
+ # Additional options.
12
17
  #
13
18
  def initialize(path,options={},&block)
14
19
  @path = path
@@ -17,9 +22,15 @@ module Wordlist
17
22
  end
18
23
 
19
24
  #
20
- # Enumerates through every word in the flat-file, passing each
21
- # word to the given _block_.
25
+ # Enumerates through every word in the flat-file.
26
+ #
27
+ # @yield [word]
28
+ # The given block will be passed every word from the word-list.
29
+ #
30
+ # @yieldparam [String] word
31
+ # A word from the word-list.
22
32
  #
33
+ # @example
23
34
  # flat_file.each_word do |word|
24
35
  # puts word
25
36
  # end
@@ -13,42 +13,55 @@ module Wordlist
13
13
  attr_accessor :min_length
14
14
 
15
15
  #
16
- # Creates a new List object with the given _options_. If a _block_
17
- # is given, it will be passed the newly created List object.
16
+ # Creates a new List object.
18
17
  #
19
- # _options_ may include the following keys:
20
- # <tt>:max_length</tt>:: The maximum length of words produced by the
21
- # list.
22
- # <tt>:min_length</tt>:: The minimum length of words produced by the
23
- # list.
18
+ # @param [Hash] options
19
+ # Additional options.
24
20
  #
25
- def initialize(options={},&block)
21
+ # @option options [Integer] :max_length
22
+ # The maximum length of words produced by the list.
23
+ #
24
+ # @option options [Integer] :min_length
25
+ # The minimum length of words produced by the list.
26
+ #
27
+ # @yield [list]
28
+ # If a block is given, it will be passed the new list object.
29
+ #
30
+ # @yieldparam [List] list
31
+ # The new list object.
32
+ #
33
+ def initialize(options={})
26
34
  @mutators = []
27
35
 
28
- @max_length = nil
29
- @min_length = 0
30
-
31
- if options[:max_length]
32
- @max_length = options[:max_length]
33
- end
34
-
35
- if options[:min_length]
36
- @min_length = options[:min_length]
37
- end
36
+ @max_length = options[:max_length]
37
+ @min_length = options.fetch(:min_length,0)
38
38
 
39
- block.call(self) if block
39
+ yield self if block_given?
40
40
  end
41
41
 
42
42
  #
43
- # Adds a mutation rule for the specified _pattern_, to be replaced
44
- # using the specified _substitute_. If a _block_ is given, and the
45
- # _substitute_ data omitted, then the _block_ will be used to
46
- # replace data matched by the _pattern_.
43
+ # Adds a mutation rule for the specified pattern, to be replaced
44
+ # using the specified substitute.
47
45
  #
48
- # list.mutate 'o', '0'
46
+ # @param [String, Regexp] pattern
47
+ # The pattern to recognize text to mutate.
49
48
  #
50
- # list.mutate '0', 0x41
49
+ # @param [String, Integer, nil] substitute
50
+ # The optional text to replace recognized text.
51
51
  #
52
+ # @yield [match]
53
+ # If a block is given, it will be passed the recognized text to be
54
+ # mutated. The return value of the block will be used to replace
55
+ # the recognized text.
56
+ #
57
+ # @yieldparam [String] match
58
+ # The recognized text to be mutated.
59
+ #
60
+ # @example
61
+ # list.mutate 'o', '0'
62
+ #
63
+ # list.mutate '0', 0x41
64
+ #
52
65
  # list.mutate(/[oO]/) do |match|
53
66
  # match.swapcase
54
67
  # end
@@ -58,10 +71,15 @@ module Wordlist
58
71
  end
59
72
 
60
73
  #
61
- # Enumerate through every word in the list, passing each word to
62
- # the given block. By default this method passes nothing to the given
63
- # _block_.
74
+ # Enumerates through every word in the list.
75
+ #
76
+ # @yield [word]
77
+ # The given block will be passed each word in the list.
64
78
  #
79
+ # @yieldparam [String] word
80
+ # A word from the list.
81
+ #
82
+ # @example
65
83
  # list.each_word do |word|
66
84
  # puts word
67
85
  # end
@@ -70,9 +88,15 @@ module Wordlist
70
88
  end
71
89
 
72
90
  #
73
- # Enumerates through every unique word in the list, passing each
74
- # unique word to the given block.
91
+ # Enumerates through every unique word in the list.
92
+ #
93
+ # @yield [word]
94
+ # The given block will be passed each unique word in the list.
95
+ #
96
+ # @yieldparam [String] word
97
+ # A unique word from the list.
75
98
  #
99
+ # @example
76
100
  # list.each_unique do |word|
77
101
  # puts word
78
102
  # end
@@ -91,9 +115,16 @@ module Wordlist
91
115
 
92
116
  #
93
117
  # Enumerates through every unique mutation, of every unique word, using
94
- # the mutator rules define for the list. Every possible unique mutation
95
- # will be passed to the given _block_.
118
+ # the mutator rules defined for the list.
119
+ #
120
+ # @yield [word]
121
+ # The given block will be passed every mutation of every unique
122
+ # word in the list.
123
+ #
124
+ # @yieldparam [String] word
125
+ # A mutation of a unique word from the list.
96
126
  #
127
+ # @example
97
128
  # list.each_mutation do |word|
98
129
  # puts word
99
130
  # end
@@ -10,18 +10,33 @@ module Wordlist
10
10
  attr_accessor :substitute
11
11
 
12
12
  #
13
- # Creates a new Mutator with the specified _pattern_ and _substitute_
14
- # data. If a _block_ is given, and the _substitute_ data is omitted, then
15
- # the _block_ will be used to replace data matched by the _pattern_.
13
+ # Creates a new Mutator object.
14
+ #
15
+ # @param [String, Regexp] pattern
16
+ # The pattern which recognizes text to mutate.
17
+ #
18
+ # @param [String, Integer, nil] substitute
19
+ # The optional text to replace recognized text.
20
+ #
21
+ # @yield [match]
22
+ # If a block is given, it will be used to mutate recognized text.
23
+ #
24
+ # @yieldparam [String] match
25
+ # The matched text to mutate.
16
26
  #
17
27
  def initialize(pattern,substitute=nil,&block)
18
- @pattern = pattern
28
+ @pattern = pattern
19
29
  @substitute = (substitute || block)
20
30
  end
21
31
 
22
32
  #
23
- # Replaces the specified _matched_ data using the +substitute+, which
24
- # may be either a String, Integer or Proc.
33
+ # Mutates the given text.
34
+ #
35
+ # @param [String] matched
36
+ # The recognized text to be mutated.
37
+ #
38
+ # @return [String]
39
+ # The mutated text.
25
40
  #
26
41
  def replace(matched)
27
42
  result = if @substitute.kind_of?(Proc)
@@ -40,9 +55,20 @@ module Wordlist
40
55
  end
41
56
 
42
57
  #
43
- # Performs every possible replacement of data, which matches the
44
- # mutators +pattern+ using the replace method, on the specified _word_
45
- # passing each variation to the given _block_.
58
+ # Enumerates over every possible mutation of the given word.
59
+ #
60
+ # @param [String] word
61
+ # The word to mutate.
62
+ #
63
+ # @yield [mutation]
64
+ # The given block will be passed every possible mutation of the
65
+ # given word.
66
+ #
67
+ # @yieldparam [String] mutation
68
+ # One possible mutation of the given word.
69
+ #
70
+ # @return [String]
71
+ # The original word.
46
72
  #
47
73
  def each(word)
48
74
  choices = 0
@@ -76,6 +102,9 @@ module Wordlist
76
102
  #
77
103
  # Inspects the mutator.
78
104
  #
105
+ # @return [String]
106
+ # The inspected mutator.
107
+ #
79
108
  def inspect
80
109
  "#{@pattern.inspect} -> #{@substitute.inspect}"
81
110
  end