wordlist 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -6,33 +6,205 @@ module Wordlist
6
6
  module Builders
7
7
  class Website < Builder
8
8
 
9
+ # Proxy to use
10
+ attr_accessor :proxy
11
+
12
+ # User-Agent to use
13
+ attr_accessor :user_agent
14
+
15
+ # Referer URL to use
16
+ attr_accessor :referer
17
+
9
18
  # Host to spider
10
19
  attr_accessor :host
11
20
 
21
+ # HTTP Host Header to use in all requests.
22
+ attr_accessor :host_header
23
+
24
+ # Additional hosts that can be spidered
25
+ attr_reader :hosts
26
+
27
+ # Links to ignore while spidering
28
+ attr_reader :ignore_links
29
+
30
+ # Specifies whether the `content` attribute of `meta` tags will be
31
+ # parsed
32
+ attr_accessor :parse_meta
33
+
34
+ # Specifies whether `title` tags will be parsed
35
+ attr_accessor :parse_title
36
+
37
+ # Specifies whether `h1` tags will be parsed
38
+ attr_accessor :parse_h1
39
+
40
+ # Specifies whether `h2` tags will be parsed
41
+ attr_accessor :parse_h2
42
+
43
+ # Specifies whether `h3` tags will be parsed
44
+ attr_accessor :parse_h3
45
+
46
+ # Specifies whether `h4` tags will be parsed
47
+ attr_accessor :parse_h4
48
+
49
+ # Specifies whether `h5` tags will be parsed
50
+ attr_accessor :parse_h5
51
+
52
+ # Specifies whether `p` tags will be parsed
53
+ attr_accessor :parse_p
54
+
55
+ # Specifies whether `span` tags will be parsed
56
+ attr_accessor :parse_span
57
+
58
+ # Specifies whether the `alt` attributes of `img` tags will be parsed
59
+ attr_accessor :parse_alt
60
+
61
+ # Specifies whether HTML comment tags will be parsed
62
+ attr_accessor :parse_comments
63
+
64
+ # Additional XPath expressions to use to parse spidered pages
65
+ attr_reader :xpaths
66
+
67
+ #
68
+ # Creates a new Website builder object.
69
+ #
70
+ # @param [String] path
71
+ # The path to the word-list to build.
72
+ #
73
+ # @param [Hash] options
74
+ # Additional options.
75
+ #
76
+ # @option options [Hash] :proxy
77
+ # The Hash of proxy information to use.
78
+ #
79
+ # @option options [String] :user_agent
80
+ # The User-Agent string to send with each request.
81
+ #
82
+ # @option options [String] :referer
83
+ # The Referer URL to send with each request.
12
84
  #
13
- # Creates a new Website builder object with the specified _path_
14
- # and _host_. If a _block_ is given, it will be passed the new created
15
- # Website builder object.
85
+ # @option options [String] :host_header
86
+ # The HTTP Host header to use in all requests.
16
87
  #
17
- def initialize(path,host,&block)
18
- @host = host
88
+ # @option options [Array<String, Regexp, Proc>] :ignore_links
89
+ # Links to ignore while spidering.
90
+ #
91
+ # @option options [Boolean] :parse_meta (true)
92
+ # Specifies whether the `content` attribute of `meta` tags will be
93
+ # parsed.
94
+ #
95
+ # @option options [Boolean] :parse_title (true)
96
+ # Specifies whether `title` tags will be parsed.
97
+ #
98
+ # @option options [Boolean] :parse_h1 (true)
99
+ # Specifies whether `h1` tags will be parsed.
100
+ #
101
+ # @option options [Boolean] :parse_h2 (true)
102
+ # Specifies whether `h2` tags will be parsed.
103
+ #
104
+ # @option options [Boolean] :parse_h3 (true)
105
+ # Specifies whether `h3` tags will be parsed.
106
+ #
107
+ # @option options [Boolean] :parse_h4 (true)
108
+ # Specifies whether `h4` tags will be parsed.
109
+ #
110
+ # @option options [Boolean] :parse_h5 (true)
111
+ # Specifies whether `h5` tags will be parsed.
112
+ #
113
+ # @option options [Boolean] :parse_p (true)
114
+ # Specifies whether `p` tags will be parsed.
115
+ #
116
+ # @option options [Boolean] :parse_span (true)
117
+ # Specifies whether `span` tags will be parsed.
118
+ #
119
+ # @option options [Boolean] :parse_alt (true)
120
+ # Specifies whether the `alt` attributes of `img` tags will be
121
+ # parsed.
122
+ #
123
+ # @option options [Boolean] :parse_comments (false)
124
+ # Specifies whether HTML comment tags will be parsed.
125
+ #
126
+ # @option options [Array<String>] :xpaths
127
+ # Additional list of XPath expressions, to use when parsing
128
+ # spidered pages.
129
+ #
130
+ def initialize(path,options={},&block)
131
+ @proxy = options.fetch(:proxy,Spidr.proxy)
132
+ @user_agent = options[:user_agent]
133
+ @referer = options[:referer]
134
+
135
+ @host = options[:host]
136
+ @host_header = options[:host_header]
137
+ @hosts = Array(options[:hosts])
138
+
139
+ @ignore_links = Array(options[:ignore_links])
19
140
 
20
- super(path,&block)
141
+ @parse_meta = options.fetch(:parse_meta,true)
142
+ @parse_title = options.fetch(:parse_title,true)
143
+ @parse_h1 = options.fetch(:parse_h1,true)
144
+ @parse_h2 = options.fetch(:parse_h2,true)
145
+ @parse_h3 = options.fetch(:parse_h3,true)
146
+ @parse_h4 = options.fetch(:parse_h4,true)
147
+ @parse_h5 = options.fetch(:parse_h5,true)
148
+ @parse_p = options.fetch(:parse_p,true)
149
+ @parse_span = options.fetch(:parse_span,true)
150
+ @parse_alt = options.fetch(:parse_alt,true)
151
+ @parse_comments = options.fetch(:parse_comments,false)
152
+
153
+ @xpaths = Array(options[:xpaths])
154
+
155
+ super(path,options,&block)
21
156
  end
22
157
 
23
158
  #
24
- # Builds the word-list file by spidering the +host+ and parsing the
25
- # inner-text from all HTML pages. If a _block_ is given, it will be
26
- # called before all HTML pages on the +host+ have been parsed.
159
+ # Builds the word-list file by spidering the `host` and parsing the
160
+ # inner-text from all HTML pages.
161
+ #
162
+ # @yield [builder]
163
+ # If a block is given, it will be called before all HTML pages on
164
+ # the `host` have been parsed.
165
+ #
166
+ # @yieldparam [Website] builder
167
+ # The website word-list builder.
27
168
  #
28
169
  def build!(&block)
29
170
  super(&block)
30
171
 
31
- Spidr.host(@host) do |spidr|
172
+ options = {
173
+ :proxy => @proxy,
174
+ :user_agent => @user_agent,
175
+ :referer => @referer,
176
+ :hosts => @hosts,
177
+ :ignore_links => @ignore_links
178
+ }
179
+
180
+ xpaths = []
181
+ xpaths << '//meta/@content' if @parse_meta
182
+ xpaths << '//title' if @parse_title
183
+ xpaths << '//h1' if @parse_h1
184
+ xpaths << '//h2' if @parse_h2
185
+ xpaths << '//h3' if @parse_h3
186
+ xpaths << '//h4' if @parse_h4
187
+ xpaths << '//h5' if @parse_h5
188
+ xpaths << '//p' if @parse_p
189
+ xpaths << '//span' if @parse_span
190
+ xpaths << '//img/@alt' if @parse_alt
191
+ xpaths += @xpaths
192
+
193
+ Spidr.host(@host,options) do |spidr|
32
194
  spidr.every_page do |page|
33
195
  if page.html?
34
- page.doc.search('//h1|//h2|//h3|//h4|//h5|//p|//span').each do |element|
35
- parse(element.inner_text)
196
+ if page.doc
197
+ xpaths.each do |xpath|
198
+ page.doc.search(xpath).each do |element|
199
+ parse(element.inner_text)
200
+ end
201
+ end
202
+ end
203
+
204
+ if (@parse_comments && page.doc)
205
+ page.doc.traverse do |element|
206
+ parse(element.inner_text) if element.comment?
207
+ end
36
208
  end
37
209
  end
38
210
  end
@@ -7,8 +7,13 @@ module Wordlist
7
7
  attr_accessor :path
8
8
 
9
9
  #
10
- # Creates a new FlatFile list with the specified _path_ and given
11
- # _options_.
10
+ # Opens a new FlatFile list.
11
+ #
12
+ # @param [String] path
13
+ # The path to the flat-file word-list to read from.
14
+ #
15
+ # @param [Hash] options
16
+ # Additional options.
12
17
  #
13
18
  def initialize(path,options={},&block)
14
19
  @path = path
@@ -17,9 +22,15 @@ module Wordlist
17
22
  end
18
23
 
19
24
  #
20
- # Enumerates through every word in the flat-file, passing each
21
- # word to the given _block_.
25
+ # Enumerates through every word in the flat-file.
26
+ #
27
+ # @yield [word]
28
+ # The given block will be passed every word from the word-list.
29
+ #
30
+ # @yieldparam [String] word
31
+ # A word from the word-list.
22
32
  #
33
+ # @example
23
34
  # flat_file.each_word do |word|
24
35
  # puts word
25
36
  # end
@@ -13,42 +13,55 @@ module Wordlist
13
13
  attr_accessor :min_length
14
14
 
15
15
  #
16
- # Creates a new List object with the given _options_. If a _block_
17
- # is given, it will be passed the newly created List object.
16
+ # Creates a new List object.
18
17
  #
19
- # _options_ may include the following keys:
20
- # <tt>:max_length</tt>:: The maximum length of words produced by the
21
- # list.
22
- # <tt>:min_length</tt>:: The minimum length of words produced by the
23
- # list.
18
+ # @param [Hash] options
19
+ # Additional options.
24
20
  #
25
- def initialize(options={},&block)
21
+ # @option options [Integer] :max_length
22
+ # The maximum length of words produced by the list.
23
+ #
24
+ # @option options [Integer] :min_length
25
+ # The minimum length of words produced by the list.
26
+ #
27
+ # @yield [list]
28
+ # If a block is given, it will be passed the new list object.
29
+ #
30
+ # @yieldparam [List] list
31
+ # The new list object.
32
+ #
33
+ def initialize(options={})
26
34
  @mutators = []
27
35
 
28
- @max_length = nil
29
- @min_length = 0
30
-
31
- if options[:max_length]
32
- @max_length = options[:max_length]
33
- end
34
-
35
- if options[:min_length]
36
- @min_length = options[:min_length]
37
- end
36
+ @max_length = options[:max_length]
37
+ @min_length = options.fetch(:min_length,0)
38
38
 
39
- block.call(self) if block
39
+ yield self if block_given?
40
40
  end
41
41
 
42
42
  #
43
- # Adds a mutation rule for the specified _pattern_, to be replaced
44
- # using the specified _substitute_. If a _block_ is given, and the
45
- # _substitute_ data omitted, then the _block_ will be used to
46
- # replace data matched by the _pattern_.
43
+ # Adds a mutation rule for the specified pattern, to be replaced
44
+ # using the specified substitute.
47
45
  #
48
- # list.mutate 'o', '0'
46
+ # @param [String, Regexp] pattern
47
+ # The pattern to recognize text to mutate.
49
48
  #
50
- # list.mutate '0', 0x41
49
+ # @param [String, Integer, nil] substitute
50
+ # The optional text to replace recognized text.
51
51
  #
52
+ # @yield [match]
53
+ # If a block is given, it will be passed the recognized text to be
54
+ # mutated. The return value of the block will be used to replace
55
+ # the recognized text.
56
+ #
57
+ # @yieldparam [String] match
58
+ # The recognized text to be mutated.
59
+ #
60
+ # @example
61
+ # list.mutate 'o', '0'
62
+ #
63
+ # list.mutate '0', 0x41
64
+ #
52
65
  # list.mutate(/[oO]/) do |match|
53
66
  # match.swapcase
54
67
  # end
@@ -58,10 +71,15 @@ module Wordlist
58
71
  end
59
72
 
60
73
  #
61
- # Enumerate through every word in the list, passing each word to
62
- # the given block. By default this method passes nothing to the given
63
- # _block_.
74
+ # Enumerate through every word in the list.
75
+ #
76
+ # @yield [word]
77
+ # The given block will be passed each word in the list.
64
78
  #
79
+ # @yieldparam [String] word
80
+ # A word from the list.
81
+ #
82
+ # @example
65
83
  # list.each_word do |word|
66
84
  # puts word
67
85
  # end
@@ -70,9 +88,15 @@ module Wordlist
70
88
  end
71
89
 
72
90
  #
73
- # Enumerates through every unique word in the list, passing each
74
- # unique word to the given block.
91
+ # Enumerates through every unique word in the list.
92
+ #
93
+ # @yield [word]
94
+ # The given block will be passed each unique word in the list.
95
+ #
96
+ # @yieldparam [String] word
97
+ # A unique word from the list.
75
98
  #
99
+ # @example
76
100
  # list.each_unique do |word|
77
101
  # puts word
78
102
  # end
@@ -91,9 +115,16 @@ module Wordlist
91
115
 
92
116
  #
93
117
  # Enumerates through every unique mutation, of every unique word, using
94
- # the mutator rules define for the list. Every possible unique mutation
95
- # will be passed to the given _block_.
118
+ # the mutator rules defined for the list.
119
+ #
120
+ # @yield [word]
121
+ # The given block will be passed every mutation of every unique
122
+ # word in the list.
123
+ #
124
+ # @yieldparam [String] word
125
+ # A mutation of a unique word from the list.
96
126
  #
127
+ # @example
97
128
  # list.each_mutation do |word|
98
129
  # puts word
99
130
  # end
@@ -10,18 +10,33 @@ module Wordlist
10
10
  attr_accessor :substitute
11
11
 
12
12
  #
13
- # Creates a new Mutator with the specified _pattern_ and _substitute_
14
- # data. If a _block_ is given, and the _substitute_ data is omitted, then
15
- # the _block_ will be used to replace data matched by the _pattern_.
13
+ # Creates a new Mutator object.
14
+ #
15
+ # @param [String, Regexp] pattern
16
+ # The pattern which recognizes text to mutate.
17
+ #
18
+ # @param [String, Integer] substitute
19
+ # The optional text to replace recognized text.
20
+ #
21
+ # @yield [match]
22
+ # If a block is given, it will be used to mutate recognized text.
23
+ #
24
+ # @yieldparam [String] match
25
+ # The match text to mutate.
16
26
  #
17
27
  def initialize(pattern,substitute=nil,&block)
18
- @pattern = pattern
28
+ @pattern = pattern
19
29
  @substitute = (substitute || block)
20
30
  end
21
31
 
22
32
  #
23
- # Replaces the specified _matched_ data using the +substitute+, which
24
- # may be either a String, Integer or Proc.
33
+ # Mutates the given text.
34
+ #
35
+ # @param [String] matched
36
+ # The recognized text to be mutated.
37
+ #
38
+ # @return [String]
39
+ # The mutated text.
25
40
  #
26
41
  def replace(matched)
27
42
  result = if @substitute.kind_of?(Proc)
@@ -40,9 +55,20 @@ module Wordlist
40
55
  end
41
56
 
42
57
  #
43
- # Performs every possible replacement of data, which matches the
44
- # mutators +pattern+ using the replace method, on the specified _word_
45
- # passing each variation to the given _block_.
58
+ # Enumerates over every possible mutation of the given word.
59
+ #
60
+ # @param [String] word
61
+ # The word to mutate.
62
+ #
63
+ # @yield [mutation]
64
+ # The given block will be passed every possible mutation of the
65
+ # given word.
66
+ #
67
+ # @yieldparam [String] mutation
68
+ # One possible mutation of the given word.
69
+ #
70
+ # @return [String]
71
+ # The original word.
46
72
  #
47
73
  def each(word)
48
74
  choices = 0
@@ -76,6 +102,9 @@ module Wordlist
76
102
  #
77
103
  # Inspects the mutator.
78
104
  #
105
+ # @return [String]
106
+ # The inspected mutator.
107
+ #
79
108
  def inspect
80
109
  "#{@pattern.inspect} -> #{@substitute.inspect}"
81
110
  end