wordlist 0.1.1 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (148) hide show
  1. checksums.yaml +7 -0
  2. data/.github/workflows/ruby.yml +27 -0
  3. data/.gitignore +6 -3
  4. data/ChangeLog.md +45 -1
  5. data/Gemfile +13 -0
  6. data/LICENSE.txt +1 -3
  7. data/README.md +266 -61
  8. data/Rakefile +7 -32
  9. data/benchmarks.rb +115 -0
  10. data/bin/wordlist +4 -7
  11. data/data/stop_words/ar.txt +104 -0
  12. data/data/stop_words/bg.txt +259 -0
  13. data/data/stop_words/bn.txt +363 -0
  14. data/data/stop_words/ca.txt +126 -0
  15. data/data/stop_words/cs.txt +138 -0
  16. data/data/stop_words/da.txt +101 -0
  17. data/data/stop_words/de.txt +129 -0
  18. data/data/stop_words/el.txt +79 -0
  19. data/data/stop_words/en.txt +175 -0
  20. data/data/stop_words/es.txt +178 -0
  21. data/data/stop_words/eu.txt +98 -0
  22. data/data/stop_words/fa.txt +332 -0
  23. data/data/stop_words/fi.txt +747 -0
  24. data/data/stop_words/fr.txt +116 -0
  25. data/data/stop_words/ga.txt +109 -0
  26. data/data/stop_words/gl.txt +160 -0
  27. data/data/stop_words/he.txt +499 -0
  28. data/data/stop_words/hi.txt +97 -0
  29. data/data/stop_words/hr.txt +179 -0
  30. data/data/stop_words/hu.txt +35 -0
  31. data/data/stop_words/hy.txt +45 -0
  32. data/data/stop_words/id.txt +357 -0
  33. data/data/stop_words/it.txt +134 -0
  34. data/data/stop_words/ja.txt +44 -0
  35. data/data/stop_words/ko.txt +677 -0
  36. data/data/stop_words/ku.txt +63 -0
  37. data/data/stop_words/lt.txt +507 -0
  38. data/data/stop_words/lv.txt +163 -0
  39. data/data/stop_words/mr.txt +99 -0
  40. data/data/stop_words/nl.txt +48 -0
  41. data/data/stop_words/no.txt +172 -0
  42. data/data/stop_words/pl.txt +138 -0
  43. data/data/stop_words/pt.txt +147 -0
  44. data/data/stop_words/ro.txt +281 -0
  45. data/data/stop_words/ru.txt +421 -0
  46. data/data/stop_words/sk.txt +173 -0
  47. data/data/stop_words/sv.txt +386 -0
  48. data/data/stop_words/th.txt +115 -0
  49. data/data/stop_words/tr.txt +114 -0
  50. data/data/stop_words/uk.txt +28 -0
  51. data/data/stop_words/ur.txt +513 -0
  52. data/data/stop_words/zh.txt +125 -0
  53. data/gemspec.yml +4 -10
  54. data/lib/wordlist/abstract_wordlist.rb +24 -0
  55. data/lib/wordlist/builder.rb +170 -138
  56. data/lib/wordlist/cli.rb +458 -0
  57. data/lib/wordlist/compression/reader.rb +72 -0
  58. data/lib/wordlist/compression/writer.rb +80 -0
  59. data/lib/wordlist/exceptions.rb +31 -0
  60. data/lib/wordlist/file.rb +176 -0
  61. data/lib/wordlist/format.rb +38 -0
  62. data/lib/wordlist/lexer/lang.rb +32 -0
  63. data/lib/wordlist/lexer/stop_words.rb +68 -0
  64. data/lib/wordlist/lexer.rb +218 -0
  65. data/lib/wordlist/list_methods.rb +462 -0
  66. data/lib/wordlist/modifiers/capitalize.rb +45 -0
  67. data/lib/wordlist/modifiers/downcase.rb +45 -0
  68. data/lib/wordlist/modifiers/gsub.rb +51 -0
  69. data/lib/wordlist/modifiers/modifier.rb +44 -0
  70. data/lib/wordlist/modifiers/mutate.rb +133 -0
  71. data/lib/wordlist/modifiers/mutate_case.rb +25 -0
  72. data/lib/wordlist/modifiers/sub.rb +97 -0
  73. data/lib/wordlist/modifiers/tr.rb +71 -0
  74. data/lib/wordlist/modifiers/upcase.rb +45 -0
  75. data/lib/wordlist/modifiers.rb +8 -0
  76. data/lib/wordlist/operators/binary_operator.rb +38 -0
  77. data/lib/wordlist/operators/concat.rb +47 -0
  78. data/lib/wordlist/operators/intersect.rb +55 -0
  79. data/lib/wordlist/operators/operator.rb +29 -0
  80. data/lib/wordlist/operators/power.rb +72 -0
  81. data/lib/wordlist/operators/product.rb +50 -0
  82. data/lib/wordlist/operators/subtract.rb +54 -0
  83. data/lib/wordlist/operators/unary_operator.rb +29 -0
  84. data/lib/wordlist/operators/union.rb +61 -0
  85. data/lib/wordlist/operators/unique.rb +52 -0
  86. data/lib/wordlist/operators.rb +7 -0
  87. data/lib/wordlist/unique_filter.rb +40 -61
  88. data/lib/wordlist/version.rb +1 -1
  89. data/lib/wordlist/words.rb +71 -0
  90. data/lib/wordlist.rb +103 -2
  91. data/spec/abstract_list_spec.rb +18 -0
  92. data/spec/builder_spec.rb +220 -76
  93. data/spec/cli_spec.rb +801 -0
  94. data/spec/compression/reader_spec.rb +137 -0
  95. data/spec/compression/writer_spec.rb +194 -0
  96. data/spec/file_spec.rb +258 -0
  97. data/spec/fixtures/wordlist.txt +15 -0
  98. data/spec/fixtures/wordlist.txt.bz2 +0 -0
  99. data/spec/fixtures/wordlist.txt.gz +0 -0
  100. data/spec/fixtures/wordlist.txt.xz +0 -0
  101. data/spec/fixtures/wordlist_with_ambiguous_format +3 -0
  102. data/spec/fixtures/wordlist_with_comments.txt +19 -0
  103. data/spec/fixtures/wordlist_with_empty_lines.txt +19 -0
  104. data/spec/format_spec.rb +50 -0
  105. data/spec/helpers/text.rb +3 -3
  106. data/spec/helpers/wordlist.rb +2 -2
  107. data/spec/lexer/lang_spec.rb +70 -0
  108. data/spec/lexer/stop_words_spec.rb +77 -0
  109. data/spec/lexer_spec.rb +652 -0
  110. data/spec/list_methods_spec.rb +181 -0
  111. data/spec/modifiers/capitalize_spec.rb +27 -0
  112. data/spec/modifiers/downcase_spec.rb +27 -0
  113. data/spec/modifiers/gsub_spec.rb +59 -0
  114. data/spec/modifiers/modifier_spec.rb +20 -0
  115. data/spec/modifiers/mutate_case_spec.rb +46 -0
  116. data/spec/modifiers/mutate_spec.rb +39 -0
  117. data/spec/modifiers/sub_spec.rb +98 -0
  118. data/spec/modifiers/tr_spec.rb +46 -0
  119. data/spec/modifiers/upcase_spec.rb +27 -0
  120. data/spec/operators/binary_operator_spec.rb +19 -0
  121. data/spec/operators/concat_spec.rb +26 -0
  122. data/spec/operators/intersect_spec.rb +37 -0
  123. data/spec/operators/operator_spec.rb +16 -0
  124. data/spec/operators/power_spec.rb +57 -0
  125. data/spec/operators/product_spec.rb +39 -0
  126. data/spec/operators/subtract_spec.rb +37 -0
  127. data/spec/operators/union_spec.rb +37 -0
  128. data/spec/operators/unique_spec.rb +25 -0
  129. data/spec/spec_helper.rb +2 -1
  130. data/spec/unique_filter_spec.rb +108 -18
  131. data/spec/wordlist_spec.rb +55 -3
  132. data/spec/words_spec.rb +41 -0
  133. metadata +183 -120
  134. data/lib/wordlist/builders/website.rb +0 -216
  135. data/lib/wordlist/builders.rb +0 -1
  136. data/lib/wordlist/flat_file.rb +0 -47
  137. data/lib/wordlist/list.rb +0 -162
  138. data/lib/wordlist/mutator.rb +0 -113
  139. data/lib/wordlist/parsers.rb +0 -74
  140. data/lib/wordlist/runners/list.rb +0 -116
  141. data/lib/wordlist/runners/runner.rb +0 -67
  142. data/lib/wordlist/runners.rb +0 -2
  143. data/scripts/benchmark +0 -59
  144. data/scripts/text/comedy_of_errors.txt +0 -4011
  145. data/spec/flat_file_spec.rb +0 -25
  146. data/spec/list_spec.rb +0 -58
  147. data/spec/mutator_spec.rb +0 -43
  148. data/spec/parsers_spec.rb +0 -118
metadata CHANGED
@@ -1,167 +1,230 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: wordlist
3
- version: !ruby/object:Gem::Version
4
- hash: 25
5
- prerelease:
6
- segments:
7
- - 0
8
- - 1
9
- - 1
10
- version: 0.1.1
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
11
5
  platform: ruby
12
- authors:
6
+ authors:
13
7
  - Postmodern
14
- autorequire:
8
+ autorequire:
15
9
  bindir: bin
16
10
  cert_chain: []
17
-
18
- date: 2012-06-12 00:00:00 Z
19
- dependencies:
20
- - !ruby/object:Gem::Dependency
21
- name: spidr
22
- prerelease: false
23
- requirement: &id001 !ruby/object:Gem::Requirement
24
- none: false
25
- requirements:
26
- - - ~>
27
- - !ruby/object:Gem::Version
28
- hash: 15
29
- segments:
30
- - 0
31
- - 2
32
- version: "0.2"
33
- type: :runtime
34
- version_requirements: *id001
35
- - !ruby/object:Gem::Dependency
11
+ date: 2021-11-02 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
36
14
  name: rubygems-tasks
37
- prerelease: false
38
- requirement: &id002 !ruby/object:Gem::Requirement
39
- none: false
40
- requirements:
41
- - - ~>
42
- - !ruby/object:Gem::Version
43
- hash: 9
44
- segments:
45
- - 0
46
- - 1
47
- version: "0.1"
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.1'
48
20
  type: :development
49
- version_requirements: *id002
50
- - !ruby/object:Gem::Dependency
51
- name: rspec
52
21
  prerelease: false
53
- requirement: &id003 !ruby/object:Gem::Requirement
54
- none: false
55
- requirements:
56
- - - ~>
57
- - !ruby/object:Gem::Version
58
- hash: 11
59
- segments:
60
- - 2
61
- - 4
62
- version: "2.4"
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.1'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rspec
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '2.4'
63
34
  type: :development
64
- version_requirements: *id003
65
- - !ruby/object:Gem::Dependency
66
- name: yard
67
35
  prerelease: false
68
- requirement: &id004 !ruby/object:Gem::Requirement
69
- none: false
70
- requirements:
71
- - - ~>
72
- - !ruby/object:Gem::Version
73
- hash: 27
74
- segments:
75
- - 0
76
- - 8
77
- version: "0.8"
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '2.4'
41
+ - !ruby/object:Gem::Dependency
42
+ name: yard
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0.8'
78
48
  type: :development
79
- version_requirements: *id004
80
- description: A Ruby library for generating and working with word-lists. Wordlist allows one to efficiently generate unique word-lists from arbitrary text or other sources, such as website content. Wordlist can also quickly enumerate through words within an existing word-list, applying multiple mutation rules to each word in the list.
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0.8'
55
+ description: Wordlist is a Ruby library for reading, manipulating, and creating wordlists,
56
+ efficiently.
81
57
  email: postmodern.mod3@gmail.com
82
- executables:
58
+ executables:
83
59
  - wordlist
84
60
  extensions: []
85
-
86
- extra_rdoc_files:
61
+ extra_rdoc_files:
87
62
  - ChangeLog.md
88
63
  - LICENSE.txt
89
64
  - README.md
90
- files:
91
- - .document
92
- - .gitignore
93
- - .rspec
94
- - .yardopts
65
+ files:
66
+ - ".document"
67
+ - ".github/workflows/ruby.yml"
68
+ - ".gitignore"
69
+ - ".rspec"
70
+ - ".yardopts"
95
71
  - ChangeLog.md
72
+ - Gemfile
96
73
  - LICENSE.txt
97
74
  - README.md
98
75
  - Rakefile
76
+ - benchmarks.rb
99
77
  - bin/wordlist
78
+ - data/stop_words/ar.txt
79
+ - data/stop_words/bg.txt
80
+ - data/stop_words/bn.txt
81
+ - data/stop_words/ca.txt
82
+ - data/stop_words/cs.txt
83
+ - data/stop_words/da.txt
84
+ - data/stop_words/de.txt
85
+ - data/stop_words/el.txt
86
+ - data/stop_words/en.txt
87
+ - data/stop_words/es.txt
88
+ - data/stop_words/eu.txt
89
+ - data/stop_words/fa.txt
90
+ - data/stop_words/fi.txt
91
+ - data/stop_words/fr.txt
92
+ - data/stop_words/ga.txt
93
+ - data/stop_words/gl.txt
94
+ - data/stop_words/he.txt
95
+ - data/stop_words/hi.txt
96
+ - data/stop_words/hr.txt
97
+ - data/stop_words/hu.txt
98
+ - data/stop_words/hy.txt
99
+ - data/stop_words/id.txt
100
+ - data/stop_words/it.txt
101
+ - data/stop_words/ja.txt
102
+ - data/stop_words/ko.txt
103
+ - data/stop_words/ku.txt
104
+ - data/stop_words/lt.txt
105
+ - data/stop_words/lv.txt
106
+ - data/stop_words/mr.txt
107
+ - data/stop_words/nl.txt
108
+ - data/stop_words/no.txt
109
+ - data/stop_words/pl.txt
110
+ - data/stop_words/pt.txt
111
+ - data/stop_words/ro.txt
112
+ - data/stop_words/ru.txt
113
+ - data/stop_words/sk.txt
114
+ - data/stop_words/sv.txt
115
+ - data/stop_words/th.txt
116
+ - data/stop_words/tr.txt
117
+ - data/stop_words/uk.txt
118
+ - data/stop_words/ur.txt
119
+ - data/stop_words/zh.txt
100
120
  - gemspec.yml
101
121
  - lib/wordlist.rb
122
+ - lib/wordlist/abstract_wordlist.rb
102
123
  - lib/wordlist/builder.rb
103
- - lib/wordlist/builders.rb
104
- - lib/wordlist/builders/website.rb
105
- - lib/wordlist/flat_file.rb
106
- - lib/wordlist/list.rb
107
- - lib/wordlist/mutator.rb
108
- - lib/wordlist/parsers.rb
109
- - lib/wordlist/runners.rb
110
- - lib/wordlist/runners/list.rb
111
- - lib/wordlist/runners/runner.rb
124
+ - lib/wordlist/cli.rb
125
+ - lib/wordlist/compression/reader.rb
126
+ - lib/wordlist/compression/writer.rb
127
+ - lib/wordlist/exceptions.rb
128
+ - lib/wordlist/file.rb
129
+ - lib/wordlist/format.rb
130
+ - lib/wordlist/lexer.rb
131
+ - lib/wordlist/lexer/lang.rb
132
+ - lib/wordlist/lexer/stop_words.rb
133
+ - lib/wordlist/list_methods.rb
134
+ - lib/wordlist/modifiers.rb
135
+ - lib/wordlist/modifiers/capitalize.rb
136
+ - lib/wordlist/modifiers/downcase.rb
137
+ - lib/wordlist/modifiers/gsub.rb
138
+ - lib/wordlist/modifiers/modifier.rb
139
+ - lib/wordlist/modifiers/mutate.rb
140
+ - lib/wordlist/modifiers/mutate_case.rb
141
+ - lib/wordlist/modifiers/sub.rb
142
+ - lib/wordlist/modifiers/tr.rb
143
+ - lib/wordlist/modifiers/upcase.rb
144
+ - lib/wordlist/operators.rb
145
+ - lib/wordlist/operators/binary_operator.rb
146
+ - lib/wordlist/operators/concat.rb
147
+ - lib/wordlist/operators/intersect.rb
148
+ - lib/wordlist/operators/operator.rb
149
+ - lib/wordlist/operators/power.rb
150
+ - lib/wordlist/operators/product.rb
151
+ - lib/wordlist/operators/subtract.rb
152
+ - lib/wordlist/operators/unary_operator.rb
153
+ - lib/wordlist/operators/union.rb
154
+ - lib/wordlist/operators/unique.rb
112
155
  - lib/wordlist/unique_filter.rb
113
156
  - lib/wordlist/version.rb
114
- - scripts/benchmark
115
- - scripts/text/comedy_of_errors.txt
157
+ - lib/wordlist/words.rb
158
+ - spec/abstract_list_spec.rb
116
159
  - spec/builder_examples.rb
117
160
  - spec/builder_spec.rb
118
161
  - spec/classes/parser_class.rb
119
162
  - spec/classes/test_list.rb
120
- - spec/flat_file_spec.rb
163
+ - spec/cli_spec.rb
164
+ - spec/compression/reader_spec.rb
165
+ - spec/compression/writer_spec.rb
166
+ - spec/file_spec.rb
167
+ - spec/fixtures/wordlist.txt
168
+ - spec/fixtures/wordlist.txt.bz2
169
+ - spec/fixtures/wordlist.txt.gz
170
+ - spec/fixtures/wordlist.txt.xz
171
+ - spec/fixtures/wordlist_with_ambiguous_format
172
+ - spec/fixtures/wordlist_with_comments.txt
173
+ - spec/fixtures/wordlist_with_empty_lines.txt
174
+ - spec/format_spec.rb
121
175
  - spec/helpers/text.rb
122
176
  - spec/helpers/wordlist.rb
123
- - spec/list_spec.rb
124
- - spec/mutator_spec.rb
125
- - spec/parsers_spec.rb
177
+ - spec/lexer/lang_spec.rb
178
+ - spec/lexer/stop_words_spec.rb
179
+ - spec/lexer_spec.rb
180
+ - spec/list_methods_spec.rb
181
+ - spec/modifiers/capitalize_spec.rb
182
+ - spec/modifiers/downcase_spec.rb
183
+ - spec/modifiers/gsub_spec.rb
184
+ - spec/modifiers/modifier_spec.rb
185
+ - spec/modifiers/mutate_case_spec.rb
186
+ - spec/modifiers/mutate_spec.rb
187
+ - spec/modifiers/sub_spec.rb
188
+ - spec/modifiers/tr_spec.rb
189
+ - spec/modifiers/upcase_spec.rb
190
+ - spec/operators/binary_operator_spec.rb
191
+ - spec/operators/concat_spec.rb
192
+ - spec/operators/intersect_spec.rb
193
+ - spec/operators/operator_spec.rb
194
+ - spec/operators/power_spec.rb
195
+ - spec/operators/product_spec.rb
196
+ - spec/operators/subtract_spec.rb
197
+ - spec/operators/union_spec.rb
198
+ - spec/operators/unique_spec.rb
126
199
  - spec/spec_helper.rb
127
200
  - spec/text/flat_file.txt
128
201
  - spec/text/previous_wordlist.txt
129
202
  - spec/text/sample.txt
130
203
  - spec/unique_filter_spec.rb
131
204
  - spec/wordlist_spec.rb
205
+ - spec/words_spec.rb
132
206
  - wordlist.gemspec
133
- homepage: https://github.com/sophsec/wordlist
134
- licenses:
207
+ homepage: https://github.com/postmodern/wordlist.rb
208
+ licenses:
135
209
  - MIT
136
- post_install_message:
210
+ metadata: {}
211
+ post_install_message:
137
212
  rdoc_options: []
138
-
139
- require_paths:
213
+ require_paths:
140
214
  - lib
141
- required_ruby_version: !ruby/object:Gem::Requirement
142
- none: false
143
- requirements:
215
+ required_ruby_version: !ruby/object:Gem::Requirement
216
+ requirements:
144
217
  - - ">="
145
- - !ruby/object:Gem::Version
146
- hash: 3
147
- segments:
148
- - 0
149
- version: "0"
150
- required_rubygems_version: !ruby/object:Gem::Requirement
151
- none: false
152
- requirements:
218
+ - !ruby/object:Gem::Version
219
+ version: '0'
220
+ required_rubygems_version: !ruby/object:Gem::Requirement
221
+ requirements:
153
222
  - - ">="
154
- - !ruby/object:Gem::Version
155
- hash: 3
156
- segments:
157
- - 0
158
- version: "0"
223
+ - !ruby/object:Gem::Version
224
+ version: '0'
159
225
  requirements: []
160
-
161
- rubyforge_project:
162
- rubygems_version: 1.8.23
163
- signing_key:
164
- specification_version: 3
165
- summary: A Ruby library for generating and working with word-lists.
226
+ rubygems_version: 3.2.22
227
+ signing_key:
228
+ specification_version: 4
229
+ summary: Ruby library for reading, manipulating, and creating wordlists.
166
230
  test_files: []
167
-
@@ -1,216 +0,0 @@
1
- require 'wordlist/builder'
2
-
3
- require 'spidr'
4
-
5
- module Wordlist
6
- module Builders
7
- class Website < Builder
8
-
9
- # Proxy to use
10
- attr_accessor :proxy
11
-
12
- # User-Agent to use
13
- attr_accessor :user_agent
14
-
15
- # Referer URL to use
16
- attr_accessor :referer
17
-
18
- # Host to spider
19
- attr_accessor :host
20
-
21
- # HTTP Host Header to use in all requests.
22
- attr_accessor :host_header
23
-
24
- # Additional hosts that can be spidered
25
- attr_reader :hosts
26
-
27
- # Links to ignore while spidering
28
- attr_reader :ignore_links
29
-
30
- # Specifies whether the `content` attribute of `meta` tags will be
31
- # parsed
32
- attr_accessor :parse_meta
33
-
34
- # Specifies whether `title` tags will be parsed
35
- attr_accessor :parse_title
36
-
37
- # Specifies whether `h1` tags will be parsed
38
- attr_accessor :parse_h1
39
-
40
- # Specifies whether `h2` tags will be parsed
41
- attr_accessor :parse_h2
42
-
43
- # Specifies whether `h3` tags will be parsed
44
- attr_accessor :parse_h3
45
-
46
- # Specifies whether `h4` tags will be parsed
47
- attr_accessor :parse_h4
48
-
49
- # Specifies whether `h5` tags will be parsed
50
- attr_accessor :parse_h5
51
-
52
- # Specifies whether `p` tags will be parsed
53
- attr_accessor :parse_p
54
-
55
- # Specifies whether `span` tags will be parsed
56
- attr_accessor :parse_span
57
-
58
- # Specifies whether the `alt` attributes of `img` tags will be parsed
59
- attr_accessor :parse_alt
60
-
61
- # Specifies whether HTML comment tags will be parsed
62
- attr_accessor :parse_comments
63
-
64
- # Additional XPath expressions to use to parse spidered pages
65
- attr_reader :xpaths
66
-
67
- #
68
- # Creates a new Website builder object.
69
- #
70
- # @param [String] path
71
- # The path to the word-list to build.
72
- #
73
- # @param [Hash] options
74
- # Additional options.
75
- #
76
- # @option options [Hash] :proxy
77
- # The Hash of proxy information to use.
78
- #
79
- # @option options [String] :user_agent
80
- # The User-Agent string to send with each request.
81
- #
82
- # @option options [String] :referer
83
- # The Referer URL to send with each request.
84
- #
85
- # @option options [String] :host_header
86
- # The HTTP Host header to use in all requests.
87
- #
88
- # @option options [Array<String, Regexp, Proc>] :ignore_links
89
- # Links to ignore while spidering.
90
- #
91
- # @option options [Boolean] :parse_meta (true)
92
- # Specifies whether the `content` attribute of `meta` tags will be
93
- # parsed.
94
- #
95
- # @option options [Boolean] :parse_title (true)
96
- # Specifies whether `title` tags will be parsed.
97
- #
98
- # @option options [Boolean] :parse_h1 (true)
99
- # Specifies whether `h1` tags will be parsed.
100
- #
101
- # @option options [Boolean] :parse_h2 (true)
102
- # Specifies whether `h2` tags will be parsed.
103
- #
104
- # @option options [Boolean] :parse_h3 (true)
105
- # Specifies whether `h3` tags will be parsed.
106
- #
107
- # @option options [Boolean] :parse_h4 (true)
108
- # Specifies whether `h4` tags will be parsed.
109
- #
110
- # @option options [Boolean] :parse_h5 (true)
111
- # Specifies whether `h5` tags will be parsed.
112
- #
113
- # @option options [Boolean] :parse_p (true)
114
- # Specifies whether `p` tags will be parsed.
115
- #
116
- # @option options [Boolean] :parse_span (true)
117
- # Specifies whether `span` tags will be parsed.
118
- #
119
- # @option options [Boolean] :parse_alt (true)
120
- # Specifies whether the `alt` attributes of `img` tags will be
121
- # parsed.
122
- #
123
- # @option options [Boolean] :parse_comments (false)
124
- # Specifies whether HTML comment tags will be parsed.
125
- #
126
- # @option options [Array<String>] :xpaths
127
- # Additional list of XPath expressions, to use when parsing
128
- # spidered pages.
129
- #
130
- def initialize(path,options={},&block)
131
- @proxy = options.fetch(:proxy,Spidr.proxy)
132
- @user_agent = options[:user_agent]
133
- @referer = options[:referer]
134
-
135
- @host = options[:host]
136
- @host_header = options[:host_header]
137
- @hosts = Array(options[:hosts])
138
-
139
- @ignore_links = Array(options[:ignore_links])
140
-
141
- @parse_meta = options.fetch(:parse_meta,true)
142
- @parse_title = options.fetch(:parse_title,true)
143
- @parse_h1 = options.fetch(:parse_h1,true)
144
- @parse_h2 = options.fetch(:parse_h2,true)
145
- @parse_h3 = options.fetch(:parse_h3,true)
146
- @parse_h4 = options.fetch(:parse_h4,true)
147
- @parse_h5 = options.fetch(:parse_h5,true)
148
- @parse_p = options.fetch(:parse_p,true)
149
- @parse_span = options.fetch(:parse_span,true)
150
- @parse_alt = options.fetch(:parse_alt,true)
151
- @parse_comments = options.fetch(:parse_comments,false)
152
-
153
- @xpaths = Array(options[:xpaths])
154
-
155
- super(path,options,&block)
156
- end
157
-
158
- #
159
- # Builds the word-list file by spidering the `host` and parsing the
160
- # inner-text from all HTML pages.
161
- #
162
- # @yield [builder]
163
- # If a block is given, it will be called before all HTML pages on
164
- # the `host` have been parsed.
165
- #
166
- # @yieldparam [Website] builder
167
- # The website word-list builder.
168
- #
169
- def build!(&block)
170
- super(&block)
171
-
172
- options = {
173
- :proxy => @proxy,
174
- :user_agent => @user_agent,
175
- :referer => @referer,
176
- :hosts => @hosts,
177
- :ignore_links => @ignore_links
178
- }
179
-
180
- xpaths = []
181
- xpaths << '//meta/@content' if @parse_meta
182
- xpaths << '//title' if @parse_title
183
- xpaths << '//h1' if @parse_h1
184
- xpaths << '//h2' if @parse_h2
185
- xpaths << '//h3' if @parse_h3
186
- xpaths << '//h4' if @parse_h4
187
- xpaths << '//h5' if @parse_h5
188
- xpaths << '//p' if @parse_p
189
- xpaths << '//span' if @parse_span
190
- xpaths << '//img/@alt' if @parse_alt
191
- xpaths += @xpaths
192
-
193
- Spidr.host(@host,options) do |spidr|
194
- spidr.every_page do |page|
195
- if page.html?
196
- if page.doc
197
- xpaths.each do |xpath|
198
- page.doc.search(xpath).each do |element|
199
- parse(element.inner_text)
200
- end
201
- end
202
- end
203
-
204
- if (@parse_comments && page.doc)
205
- page.doc.traverse do |element|
206
- parse(element.inner_text) if element.comment?
207
- end
208
- end
209
- end
210
- end
211
- end
212
- end
213
-
214
- end
215
- end
216
- end
@@ -1 +0,0 @@
1
- require 'wordlist/builders/website'
@@ -1,47 +0,0 @@
1
- require 'wordlist/list'
2
-
3
- module Wordlist
4
- class FlatFile < List
5
-
6
- # The path to the flat-file
7
- attr_accessor :path
8
-
9
- #
10
- # Opens a new FlatFile list.
11
- #
12
- # @param [String] path
13
- # The path to the flat file word-list read from.
14
- #
15
- # @param [Hash] options
16
- # Additional options.
17
- #
18
- def initialize(path,options={},&block)
19
- @path = path
20
-
21
- super(options,&block)
22
- end
23
-
24
- #
25
- # Enumerates through every word in the flat-file.
26
- #
27
- # @yield [word]
28
- # The given block will be passed every word from the word-list.
29
- #
30
- # @yieldparam [String] word
31
- # A word from the word-list.
32
- #
33
- # @example
34
- # flat_file.each_word do |word|
35
- # puts word
36
- # end
37
- #
38
- def each_word(&block)
39
- File.open(@path) do |file|
40
- file.each_line do |line|
41
- yield line.chomp
42
- end
43
- end
44
- end
45
-
46
- end
47
- end