watobo 0.9.19 → 0.9.20
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.md +104 -0
- data/bin/nfq_server.rb +8 -20
- data/bin/watobo_gui.rb +8 -20
- data/config/forwarding_proxy.yml +2 -2
- data/lib/watobo.rb +12 -22
- data/lib/watobo/adapters.rb +12 -24
- data/lib/watobo/adapters/data_store.rb +76 -66
- data/lib/watobo/adapters/file/file_store.rb +295 -307
- data/lib/watobo/adapters/session_store.rb +13 -25
- data/lib/watobo/ca.rb +9 -21
- data/lib/watobo/config.rb +205 -217
- data/lib/watobo/constants.rb +8 -20
- data/lib/watobo/core.rb +11 -23
- data/lib/watobo/core/active_check.rb +11 -21
- data/lib/watobo/core/active_checks.rb +57 -69
- data/lib/watobo/core/ca.rb +388 -398
- data/lib/watobo/core/cert_store.rb +42 -54
- data/lib/watobo/core/chat.rb +100 -112
- data/lib/watobo/core/chats.rb +271 -275
- data/lib/watobo/core/client_cert_store.rb +33 -45
- data/lib/watobo/core/conversation.rb +56 -68
- data/lib/watobo/core/cookie.rb +31 -43
- data/lib/watobo/core/finding.rb +74 -86
- data/lib/watobo/core/findings.rb +113 -125
- data/lib/watobo/core/forwarding_proxy.rb +44 -35
- data/lib/watobo/core/fuzz_gen.rb +8 -20
- data/lib/watobo/core/intercept_carver.rb +176 -188
- data/lib/watobo/core/intercept_filter.rb +243 -255
- data/lib/watobo/core/interceptor.rb +106 -118
- data/lib/watobo/core/min_class.rb +12 -24
- data/lib/watobo/core/netfilter_queue.rb +178 -190
- data/lib/watobo/core/ott_cache.rb +152 -148
- data/lib/watobo/core/parameter.rb +53 -58
- data/lib/watobo/core/passive_check.rb +8 -20
- data/lib/watobo/core/passive_checks.rb +56 -68
- data/lib/watobo/core/passive_scanner.rb +54 -66
- data/lib/watobo/core/plugin.rb +19 -31
- data/lib/watobo/core/project.rb +8 -20
- data/lib/watobo/core/proxy.rb +51 -63
- data/lib/watobo/core/request.rb +128 -120
- data/lib/watobo/core/response.rb +59 -61
- data/lib/watobo/core/scanner.rb +8 -20
- data/lib/watobo/core/scanner3.rb +413 -425
- data/lib/watobo/core/scope.rb +91 -103
- data/lib/watobo/core/session.rb +109 -87
- data/lib/watobo/core/sid_cache.rb +106 -118
- data/lib/watobo/core/subscriber.rb +33 -45
- data/lib/watobo/defaults.rb +29 -41
- data/lib/watobo/external/diff/lcs.rb +8 -20
- data/lib/watobo/external/diff/lcs/array.rb +8 -20
- data/lib/watobo/external/diff/lcs/block.rb +8 -20
- data/lib/watobo/external/diff/lcs/callbacks.rb +8 -20
- data/lib/watobo/external/diff/lcs/change.rb +8 -20
- data/lib/watobo/external/diff/lcs/hunk.rb +8 -20
- data/lib/watobo/external/diff/lcs/ldiff.rb +8 -20
- data/lib/watobo/external/diff/lcs/string.rb +8 -20
- data/lib/watobo/externals.rb +14 -26
- data/lib/watobo/framework.rb +12 -24
- data/lib/watobo/framework/create_project.rb +68 -80
- data/lib/watobo/framework/init.rb +8 -20
- data/lib/watobo/framework/init_modules.rb +8 -20
- data/lib/watobo/framework/license_text.rb +36 -48
- data/lib/watobo/framework/load_chat.rb +21 -33
- data/lib/watobo/gui.rb +121 -133
- data/lib/watobo/gui/about_watobo.rb +8 -20
- data/lib/watobo/gui/browser_preview.rb +8 -20
- data/lib/watobo/gui/certificate_dialog.rb +8 -20
- data/lib/watobo/gui/chat_diff.rb +11 -21
- data/lib/watobo/gui/chatviewer_frame.rb +10 -22
- data/lib/watobo/gui/checkboxtree.rb +8 -20
- data/lib/watobo/gui/checks_policy_frame.rb +8 -20
- data/lib/watobo/gui/client_cert_dialog.rb +10 -21
- data/lib/watobo/gui/confirm_scan_dialog.rb +8 -20
- data/lib/watobo/gui/conversation_table.rb +54 -44
- data/lib/watobo/gui/conversation_table_ctrl.rb +215 -227
- data/lib/watobo/gui/conversation_table_ctrl2.rb +385 -393
- data/lib/watobo/gui/csrf_token_dialog.rb +11 -25
- data/lib/watobo/gui/custom_viewer.rb +357 -369
- data/lib/watobo/gui/dashboard.rb +8 -20
- data/lib/watobo/gui/define_scope_frame.rb +8 -20
- data/lib/watobo/gui/differ_frame.rb +223 -235
- data/lib/watobo/gui/edit_comment.rb +8 -20
- data/lib/watobo/gui/edit_scope_dialog.rb +8 -20
- data/lib/watobo/gui/export_dialog.rb +114 -0
- data/lib/watobo/gui/finding_info.rb +9 -21
- data/lib/watobo/gui/findings_tree.rb +8 -20
- data/lib/watobo/gui/full_scan_dialog.rb +8 -20
- data/lib/watobo/gui/fuzzer_gui.rb +8 -20
- data/lib/watobo/gui/goto_url_dialog.rb +78 -90
- data/lib/watobo/gui/hex_viewer.rb +25 -27
- data/lib/watobo/gui/html_viewer.rb +295 -307
- data/lib/watobo/gui/intercept_filter_dialog.rb +196 -208
- data/lib/watobo/gui/interceptor_gui.rb +1046 -1041
- data/lib/watobo/gui/interceptor_settings_dialog.rb +8 -20
- data/lib/watobo/gui/list_box.rb +109 -121
- data/lib/watobo/gui/log_file_viewer.rb +40 -52
- data/lib/watobo/gui/log_viewer.rb +87 -99
- data/lib/watobo/gui/login_wizzard.rb +8 -20
- data/lib/watobo/gui/main_window.rb +34 -33
- data/lib/watobo/gui/manual_request_editor.rb +25 -35
- data/lib/watobo/gui/master_pw_dialog.rb +8 -20
- data/lib/watobo/gui/mixins/gui_settings.rb +37 -49
- data/lib/watobo/gui/page_tree.rb +225 -237
- data/lib/watobo/gui/password_policy_dialog.rb +8 -20
- data/lib/watobo/gui/plugin_board.rb +8 -20
- data/lib/watobo/gui/preferences_dialog.rb +8 -20
- data/lib/watobo/gui/progress_window.rb +8 -20
- data/lib/watobo/gui/project_wizzard.rb +8 -20
- data/lib/watobo/gui/proxy_dialog.rb +117 -85
- data/lib/watobo/gui/quick_scan_dialog.rb +8 -20
- data/lib/watobo/gui/request_builder_frame.rb +125 -122
- data/lib/watobo/gui/request_editor.rb +53 -28
- data/lib/watobo/gui/rewrite_filters_dialog.rb +402 -414
- data/lib/watobo/gui/rewrite_rules_dialog.rb +380 -392
- data/lib/watobo/gui/save_chat_dialog.rb +148 -160
- data/lib/watobo/gui/scanner_settings_dialog.rb +8 -20
- data/lib/watobo/gui/select_chat_dialog.rb +8 -20
- data/lib/watobo/gui/session_management_dialog.rb +8 -20
- data/lib/watobo/gui/sites_tree.rb +118 -22
- data/lib/watobo/gui/status_bar.rb +8 -20
- data/lib/watobo/gui/table_editor.rb +76 -53
- data/lib/watobo/gui/tagless_viewer.rb +10 -21
- data/lib/watobo/gui/templates/plugin.rb +8 -20
- data/lib/watobo/gui/templates/plugin2.rb +99 -111
- data/lib/watobo/gui/templates/plugin_base.rb +152 -164
- data/lib/watobo/gui/text_viewer.rb +8 -20
- data/lib/watobo/gui/transcoder_window.rb +15 -22
- data/lib/watobo/gui/utils/gui_utils.rb +8 -20
- data/lib/watobo/gui/utils/init_icons.rb +94 -106
- data/lib/watobo/gui/utils/load_icons.rb +41 -53
- data/lib/watobo/gui/utils/load_plugins.rb +118 -130
- data/lib/watobo/gui/utils/master_password.rb +76 -88
- data/lib/watobo/gui/utils/save_default_settings.rb +121 -133
- data/lib/watobo/gui/utils/save_project_settings.rb +8 -20
- data/lib/watobo/gui/utils/save_proxy_settings.rb +53 -21
- data/lib/watobo/gui/utils/save_scanner_settings.rb +26 -38
- data/lib/watobo/gui/utils/session_history.rb +120 -132
- data/lib/watobo/gui/workspace_dialog.rb +8 -20
- data/lib/watobo/gui/www_auth_dialog.rb +8 -20
- data/lib/watobo/gui/xml_viewer_frame.rb +8 -20
- data/lib/watobo/http.rb +12 -23
- data/lib/watobo/http/cookies/cookies.rb +63 -70
- data/lib/watobo/http/data/data.rb +56 -64
- data/lib/watobo/http/data/json.rb +51 -0
- data/lib/watobo/http/url/url.rb +46 -58
- data/lib/watobo/http/xml/xml.rb +129 -141
- data/lib/watobo/interceptor.rb +11 -23
- data/lib/watobo/interceptor/proxy.rb +624 -625
- data/lib/watobo/interceptor/transparent.rb +22 -34
- data/lib/watobo/mixins.rb +18 -30
- data/lib/watobo/mixins/check_info.rb +35 -47
- data/lib/watobo/mixins/httpparser.rb +42 -35
- data/lib/watobo/mixins/request_parser.rb +8 -20
- data/lib/watobo/mixins/shapers.rb +484 -477
- data/lib/watobo/mixins/transcoders.rb +8 -20
- data/lib/watobo/parser.rb +9 -21
- data/lib/watobo/parser/html.rb +91 -103
- data/lib/watobo/sockets.rb +11 -23
- data/lib/watobo/sockets/agent.rb +836 -848
- data/lib/watobo/sockets/client_socket.rb +283 -277
- data/lib/watobo/sockets/connection.rb +409 -421
- data/lib/watobo/sockets/http_socket.rb +16 -23
- data/lib/watobo/sockets/ntlm_auth.rb +137 -149
- data/lib/watobo/utils.rb +18 -30
- data/lib/watobo/utils/check_regex.rb +8 -20
- data/lib/watobo/utils/copy_object.rb +8 -20
- data/lib/watobo/utils/crypto.rb +8 -20
- data/lib/watobo/utils/expand_range.rb +31 -43
- data/lib/watobo/utils/export_xml.rb +108 -0
- data/lib/watobo/utils/file_management.rb +8 -20
- data/lib/watobo/utils/hexprint.rb +17 -29
- data/lib/watobo/utils/load_chat.rb +8 -20
- data/lib/watobo/utils/load_icon.rb +8 -20
- data/lib/watobo/{external/ntlm → utils}/ntlm.rb +874 -796
- data/lib/watobo/utils/print_debug.rb +20 -32
- data/lib/watobo/utils/response_builder.rb +98 -110
- data/lib/watobo/utils/response_hash.rb +9 -20
- data/lib/watobo/utils/secure_eval.rb +10 -22
- data/lib/watobo/utils/strings.rb +18 -30
- data/lib/watobo/utils/text2request.rb +12 -20
- data/lib/watobo/utils/url.rb +31 -43
- data/lib/watobo/utils/utf16.rb +22 -0
- data/modules/active/Apache/mod_status.rb +9 -0
- data/modules/active/Apache/multiview.rb +161 -0
- data/modules/active/Flash/crossdomain.rb +9 -0
- data/modules/active/directories/dirwalker.rb +8 -20
- data/modules/active/discovery/fileextensions.rb +10 -22
- data/modules/active/discovery/http_methods.rb +8 -20
- data/modules/active/domino/domino_db.rb +8 -20
- data/modules/active/dotNET/custom_errors.rb +110 -122
- data/modules/active/dotNET/dotnet_files.rb +98 -110
- data/modules/active/fileinclusion/lfi_simple.rb +8 -20
- data/modules/active/jboss/jboss_basic.rb +8 -20
- data/modules/active/sap/business_objects.rb +63 -0
- data/modules/active/sap/its_commands.rb +8 -20
- data/modules/active/sap/its_service_parameter.rb +8 -20
- data/modules/active/sap/its_services.rb +8 -20
- data/modules/active/sap/its_xss.rb +8 -20
- data/modules/active/shell_shock/shell_shock.rb +149 -0
- data/modules/active/siebel/siebel_apps.rb +168 -180
- data/modules/active/sqlinjection/sql_boolean.rb +9 -21
- data/modules/active/sqlinjection/sqli_error.rb +10 -22
- data/modules/active/sqlinjection/sqli_timing.rb +228 -240
- data/modules/active/struts2/default_handler_ognl.rb +114 -126
- data/modules/active/struts2/include_params_ognl.rb +113 -125
- data/modules/active/xml/xml_xxe.rb +122 -127
- data/modules/active/xss/xss_ng.rb +223 -234
- data/modules/active/xss/xss_simple.rb +8 -20
- data/modules/passive/ajax.rb +76 -84
- data/modules/passive/autocomplete.rb +64 -76
- data/modules/passive/cookie_options.rb +8 -20
- data/modules/passive/cookie_xss.rb +9 -21
- data/modules/passive/detect_code.rb +9 -21
- data/modules/passive/detect_fileupload.rb +11 -22
- data/modules/passive/detect_infrastructure.rb +23 -35
- data/modules/passive/detect_one_time_tokens.rb +8 -20
- data/modules/passive/dirindexing.rb +9 -21
- data/modules/passive/disclosure_domino.rb +66 -79
- data/modules/passive/disclosure_emails.rb +9 -21
- data/modules/passive/disclosure_ipaddr.rb +15 -23
- data/modules/passive/filename_as_parameter.rb +8 -20
- data/modules/passive/form_spotter.rb +15 -21
- data/modules/passive/hidden_fields.rb +64 -70
- data/modules/passive/hotspots.rb +13 -22
- data/modules/passive/in_script_parameter.rb +15 -24
- data/modules/passive/multiple_server_headers.rb +8 -20
- data/modules/passive/possible_login.rb +12 -23
- data/modules/passive/redirect_url.rb +10 -22
- data/modules/passive/redirectionz.rb +9 -21
- data/modules/passive/sap-headers.rb +64 -76
- data/modules/passive/xss_dom.rb +10 -21
- data/plugins/catalog/catalog.rb +17 -23
- data/plugins/crawler/crawler.rb +12 -24
- data/plugins/crawler/gui.rb +13 -25
- data/plugins/crawler/gui/auth_frame.rb +278 -290
- data/plugins/crawler/gui/crawler_gui.rb +302 -320
- data/plugins/crawler/gui/general_settings_frame.rb +104 -116
- data/plugins/crawler/gui/hooks_frame.rb +88 -100
- data/plugins/crawler/gui/scope_frame.rb +58 -70
- data/plugins/crawler/gui/settings_tabbook.rb +46 -58
- data/plugins/crawler/gui/status_frame.rb +67 -78
- data/plugins/crawler/lib/bags.rb +26 -38
- data/plugins/crawler/lib/constants.rb +19 -31
- data/plugins/crawler/lib/engine.rb +505 -508
- data/plugins/crawler/lib/grabber.rb +77 -87
- data/plugins/crawler/lib/status.rb +82 -0
- data/plugins/crawler/lib/uri_mp.rb +20 -32
- data/plugins/filefinder/dbs/siebel_paths.txt +1118 -0
- data/plugins/filefinder/dbs/subs-big.lst +31986 -0
- data/plugins/filefinder/filefinder.rb +13 -23
- data/plugins/sqlmap/bin/test.rb +86 -98
- data/plugins/sqlmap/gui.rb +12 -24
- data/plugins/sqlmap/gui/main.rb +226 -238
- data/plugins/sqlmap/gui/options_frame.rb +105 -117
- data/plugins/sqlmap/lib/sqlmap_ctrl.rb +103 -115
- data/plugins/sqlmap/sqlmap.rb +10 -22
- data/plugins/sslchecker/cli/sslchecker_cli.rb +8 -20
- data/plugins/sslchecker/gui/cipher_table.rb +252 -264
- data/plugins/sslchecker/gui/gui.rb +267 -276
- data/plugins/sslchecker/gui/sslchecker.rb +12 -24
- data/plugins/sslchecker/lib/check.rb +172 -80
- data/plugins/wshell/gui/main.rb +115 -127
- data/plugins/wshell/lib/core.rb +85 -97
- data/plugins/wshell/wshell.rb +19 -31
- metadata +14 -6
- data/.yardopts +0 -24
@@ -1,61 +1,49 @@
|
|
1
|
-
|
1
|
+
#.
|
2
2
|
# settings_tabbook.rb
|
3
|
-
|
4
|
-
# Copyright
|
5
|
-
#
|
6
|
-
#
|
7
|
-
#
|
8
|
-
#
|
9
|
-
# WATOBO is free software; you can redistribute it and/or modify
|
10
|
-
# it under the terms of the GNU General Public License as published by
|
11
|
-
# the Free Software Foundation version 2 of the License.
|
12
|
-
#
|
13
|
-
# WATOBO is distributed in the hope that it will be useful,
|
14
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
16
|
-
# GNU General Public License for more details.
|
17
|
-
#
|
18
|
-
# You should have received a copy of the GNU General Public License
|
19
|
-
# along with WATOBO; if not, write to the Free Software
|
20
|
-
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
21
|
-
# .
|
22
|
-
# @private
|
23
|
-
module Watobo#:nodoc: all
|
24
|
-
module Plugin
|
25
|
-
module Crawler
|
26
|
-
class Gui
|
27
|
-
class SettingsTabBook < FXTabBook
|
28
|
-
attr :hooks, :general, :log_viewer, :auth, :scope
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
def initialize(owner)
|
33
|
-
#@tab = FXTabBook.new(self, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
|
34
|
-
super(owner, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
|
35
|
-
FXTabItem.new(self, "General", nil)
|
36
|
-
# frame = FXVerticalFrame.new(self, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_RAISED)
|
37
|
-
@general = GeneralSettingsFrame.new(self)
|
38
|
-
|
39
|
-
FXTabItem.new(self, "Scope", nil)
|
40
|
-
@scope = ScopeFrame.new(self)
|
41
|
-
|
42
|
-
FXTabItem.new(self, "Auth", nil)
|
43
|
-
@auth = AuthFrame.new(self)
|
3
|
+
#.
|
4
|
+
# Copyright 2014 by siberas, http://www.siberas.de
|
5
|
+
# This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
|
6
|
+
# WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
|
7
|
+
# WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
8
|
+
# You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
44
9
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
10
|
+
# @private
|
11
|
+
module Watobo#:nodoc: all
|
12
|
+
module Plugin
|
13
|
+
module Crawler
|
14
|
+
class Gui
|
15
|
+
class SettingsTabBook < FXTabBook
|
16
|
+
attr :hooks, :general, :log_viewer, :auth, :scope
|
17
|
+
|
18
|
+
|
19
|
+
|
20
|
+
def initialize(owner)
|
21
|
+
#@tab = FXTabBook.new(self, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
|
22
|
+
super(owner, nil, 0, LAYOUT_FILL_X|LAYOUT_FILL_Y|LAYOUT_RIGHT)
|
23
|
+
FXTabItem.new(self, "General", nil)
|
24
|
+
# frame = FXVerticalFrame.new(self, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_RAISED)
|
25
|
+
@general = GeneralSettingsFrame.new(self)
|
26
|
+
|
27
|
+
FXTabItem.new(self, "Scope", nil)
|
28
|
+
@scope = ScopeFrame.new(self)
|
29
|
+
|
30
|
+
FXTabItem.new(self, "Auth", nil)
|
31
|
+
@auth = AuthFrame.new(self)
|
32
|
+
|
33
|
+
|
34
|
+
FXTabItem.new(self, "Hooks", nil)
|
35
|
+
@hooks = HooksFrame.new(self)
|
36
|
+
|
37
|
+
FXTabItem.new(self, "Log", nil)
|
38
|
+
frame = FXVerticalFrame.new(self, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_THICK|FRAME_RAISED)
|
39
|
+
@log_viewer = Watobo::Gui::LogViewer.new(frame, :append, :opts => LAYOUT_FILL_X|LAYOUT_FILL_Y|FRAME_SUNKEN)
|
40
|
+
|
41
|
+
self.connect(SEL_COMMAND){
|
42
|
+
@hooks.selected if self.current == 3
|
43
|
+
}
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
61
49
|
end
|
@@ -1,82 +1,71 @@
|
|
1
|
-
|
1
|
+
#.
|
2
2
|
# status_frame.rb
|
3
|
-
|
4
|
-
# Copyright
|
5
|
-
#
|
6
|
-
#
|
7
|
-
#
|
8
|
-
#
|
9
|
-
# WATOBO is free software; you can redistribute it and/or modify
|
10
|
-
# it under the terms of the GNU General Public License as published by
|
11
|
-
# the Free Software Foundation version 2 of the License.
|
12
|
-
#
|
13
|
-
# WATOBO is distributed in the hope that it will be useful,
|
14
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
16
|
-
# GNU General Public License for more details.
|
17
|
-
#
|
18
|
-
# You should have received a copy of the GNU General Public License
|
19
|
-
# along with WATOBO; if not, write to the Free Software
|
20
|
-
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
21
|
-
# .
|
22
|
-
# @private
|
23
|
-
module Watobo#:nodoc: all
|
24
|
-
module Plugin
|
25
|
-
module Crawler
|
26
|
-
class Gui
|
27
|
-
class StatusFrame < FXHorizontalFrame
|
3
|
+
#.
|
4
|
+
# Copyright 2014 by siberas, http://www.siberas.de
|
5
|
+
# This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
|
6
|
+
# WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
|
7
|
+
# WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
8
|
+
# You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
28
9
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
10
|
+
# @private
|
11
|
+
module Watobo#:nodoc: all
|
12
|
+
module Plugin
|
13
|
+
module Crawler
|
14
|
+
class Gui
|
15
|
+
class StatusFrame < FXHorizontalFrame
|
16
|
+
|
17
|
+
include Watobo::Plugin::Crawler::Constants
|
18
|
+
# :engine_status => CRAWL_NONE,
|
19
|
+
# :page_size => 0,
|
20
|
+
# :link_size => 0,
|
21
|
+
# :skipped_domains => 0
|
34
22
|
def update_status(status)
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
frame =
|
68
|
-
|
69
|
-
@info_fields << (@
|
70
|
-
@info_fields << (@
|
71
|
-
@info_fields << (@
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
23
|
+
#puts status.to_yaml
|
24
|
+
if status.has_key? :engine_status
|
25
|
+
case status[:engine_status]
|
26
|
+
when CRAWL_NONE
|
27
|
+
self.backColor = self.parent.backColor
|
28
|
+
@status_txt.text = "Status: Idle"
|
29
|
+
when CRAWL_RUNNING
|
30
|
+
self.backColor = FXColor::Red
|
31
|
+
@status_txt.text = "Status: Running"
|
32
|
+
|
33
|
+
when CRAWL_PAUSED
|
34
|
+
self.backColor = FXColor::Yellow
|
35
|
+
@status_txt.text = "Status: Paused"
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
if status.has_key? :link_size
|
40
|
+
@link_size_txt.text = "Links: #{status[:link_size]}"
|
41
|
+
end
|
42
|
+
|
43
|
+
if status.has_key? :page_size
|
44
|
+
@page_size_txt.text = "Pages: #{status[:page_size]}"
|
45
|
+
end
|
46
|
+
|
47
|
+
if status.has_key? :total_requests
|
48
|
+
@requests_txt.text = "Requests: #{status[:total_requests]}"
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
def initialize(owner)
|
53
|
+
super(owner, :opts => LAYOUT_FILL_X|FRAME_RAISED)
|
54
|
+
@info_fields = []
|
55
|
+
#frame = FXHorizontalFrame.new(, :opts => LAYOUT_FILL_Y, :padding => 0)
|
56
|
+
frame = self
|
57
|
+
@info_fields << ( @status_txt = FXLabel.new(frame, "Status: Stopped", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 100) )
|
58
|
+
@info_fields << (@link_size_txt = FXLabel.new(frame, "Links: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 70) )
|
59
|
+
@info_fields << (@page_size_txt = FXLabel.new(frame, "Pages: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 70) )
|
60
|
+
@info_fields << (@requests_txt = FXLabel.new(frame, "Requests: 0", :opts => FRAME_SUNKEN|LAYOUT_FIX_WIDTH, :width => 100) )
|
61
|
+
|
62
|
+
@info_fields.each do |i|
|
63
|
+
i.justify = JUSTIFY_LEFT
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
82
71
|
end
|
data/plugins/crawler/lib/bags.rb
CHANGED
@@ -1,41 +1,29 @@
|
|
1
|
-
|
1
|
+
#.
|
2
2
|
# bags.rb
|
3
|
-
|
4
|
-
# Copyright
|
5
|
-
#
|
6
|
-
#
|
7
|
-
#
|
8
|
-
#
|
9
|
-
# WATOBO is free software; you can redistribute it and/or modify
|
10
|
-
# it under the terms of the GNU General Public License as published by
|
11
|
-
# the Free Software Foundation version 2 of the License.
|
12
|
-
#
|
13
|
-
# WATOBO is distributed in the hope that it will be useful,
|
14
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
16
|
-
# GNU General Public License for more details.
|
17
|
-
#
|
18
|
-
# You should have received a copy of the GNU General Public License
|
19
|
-
# along with WATOBO; if not, write to the Free Software
|
20
|
-
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
21
|
-
# .
|
22
|
-
# @private
|
23
|
-
module Watobo#:nodoc: all
|
24
|
-
module Crawler
|
25
|
-
class PageBag
|
26
|
-
attr :page, :depth
|
27
|
-
def initialize(page, depth)
|
28
|
-
@page = page
|
29
|
-
@depth = depth
|
30
|
-
end
|
31
|
-
end
|
3
|
+
#.
|
4
|
+
# Copyright 2014 by siberas, http://www.siberas.de
|
5
|
+
# This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
|
6
|
+
# WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
|
7
|
+
# WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
8
|
+
# You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
32
9
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
10
|
+
# @private
|
11
|
+
module Watobo#:nodoc: all
|
12
|
+
module Crawler
|
13
|
+
class PageBag
|
14
|
+
attr :page, :depth
|
15
|
+
def initialize(page, depth)
|
16
|
+
@page = page
|
17
|
+
@depth = depth
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
class LinkBag
|
22
|
+
attr :link, :depth
|
23
|
+
def initialize(link, depth)
|
24
|
+
@link = link
|
25
|
+
@depth = depth
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
41
29
|
end
|
@@ -1,34 +1,22 @@
|
|
1
|
-
|
1
|
+
#.
|
2
2
|
# constants.rb
|
3
|
-
|
4
|
-
# Copyright
|
5
|
-
#
|
6
|
-
#
|
7
|
-
#
|
8
|
-
#
|
9
|
-
|
10
|
-
# it under the terms of the GNU General Public License as published by
|
11
|
-
# the Free Software Foundation version 2 of the License.
|
12
|
-
#
|
13
|
-
# WATOBO is distributed in the hope that it will be useful,
|
14
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
16
|
-
# GNU General Public License for more details.
|
17
|
-
#
|
18
|
-
# You should have received a copy of the GNU General Public License
|
19
|
-
# along with WATOBO; if not, write to the Free Software
|
20
|
-
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
21
|
-
# .
|
3
|
+
#.
|
4
|
+
# Copyright 2014 by siberas, http://www.siberas.de
|
5
|
+
# This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
|
6
|
+
# WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
|
7
|
+
# WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
8
|
+
# You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
9
|
+
|
22
10
|
# @private
|
23
|
-
module Watobo#:nodoc: all
|
24
|
-
module Plugin
|
25
|
-
module Crawler
|
26
|
-
module Constants
|
27
|
-
CRAWL_NONE = 0x00
|
28
|
-
CRAWL_RUNNING = 0x01
|
29
|
-
CRAWL_PAUSED = 0x02
|
30
|
-
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
11
|
+
module Watobo#:nodoc: all
|
12
|
+
module Plugin
|
13
|
+
module Crawler
|
14
|
+
module Constants
|
15
|
+
CRAWL_NONE = 0x00
|
16
|
+
CRAWL_RUNNING = 0x01
|
17
|
+
CRAWL_PAUSED = 0x02
|
18
|
+
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
34
22
|
end
|
@@ -1,520 +1,517 @@
|
|
1
|
-
|
1
|
+
#.
|
2
2
|
# engine.rb
|
3
|
-
|
4
|
-
# Copyright
|
5
|
-
#
|
6
|
-
#
|
7
|
-
#
|
8
|
-
#
|
9
|
-
# WATOBO is free software; you can redistribute it and/or modify
|
10
|
-
# it under the terms of the GNU General Public License as published by
|
11
|
-
# the Free Software Foundation version 2 of the License.
|
12
|
-
#
|
13
|
-
# WATOBO is distributed in the hope that it will be useful,
|
14
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
16
|
-
# GNU General Public License for more details.
|
17
|
-
#
|
18
|
-
# You should have received a copy of the GNU General Public License
|
19
|
-
# along with WATOBO; if not, write to the Free Software
|
20
|
-
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
21
|
-
# .
|
22
|
-
# @private
|
23
|
-
module Watobo#:nodoc: all
|
24
|
-
module Crawler
|
25
|
-
|
26
|
-
class Agent < Mechanize
|
27
|
-
|
28
|
-
def initialize(opts)
|
29
|
-
super()
|
30
|
-
|
31
|
-
|
32
|
-
self.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
33
|
-
self.ignore_bad_chunking = true
|
34
|
-
self.keep_alive = false
|
35
|
-
|
36
|
-
self.user_agent = opts[:user_agent] if opts.has_key?(:user_agent)
|
37
|
-
|
38
|
-
if opts.has_key? :username and opts.has_key? :password
|
39
|
-
unless opts[:username].empty? and opts[:password].empty?
|
40
|
-
|
41
|
-
user = opts[:username]
|
42
|
-
pw = opts[:password]
|
43
|
-
uri = opts[:auth_uri]
|
44
|
-
# puts "Got Credentials for #{uri}: #{user} / #{pw}"
|
45
|
-
self.add_auth(uri, user , pw )
|
46
|
-
# TODO: remove this workaround for a Mechanize Bug (#243)
|
47
|
-
p = self.get uri
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
|
-
if ( opts.has_key? :proxy_host ) && ( opts.has_key? :proxy_port )
|
52
|
-
self.set_proxy( opts[:proxy_host], opts[:proxy_port] )
|
53
|
-
end
|
54
|
-
|
55
|
-
if opts.has_key? :pre_connect_hook
|
56
|
-
self.pre_connect_hooks << opts[:pre_connect_hook] if opts[:pre_connect_hook].respond_to? :call
|
57
|
-
end
|
58
|
-
|
59
|
-
unless opts[:cookie_jar].nil?
|
60
|
-
clean_jar = Mechanize::CookieJar.new
|
61
|
-
opts[:cookie_jar].each{ |cookie|
|
62
|
-
clean_jar.add! cookie
|
63
|
-
}
|
64
|
-
self.cookie_jar = clean_jar
|
65
|
-
end
|
66
|
-
|
67
|
-
end
|
68
|
-
|
69
|
-
end
|
70
|
-
|
71
|
-
class Engine
|
72
|
-
include Watobo::Plugin::Crawler::Constants
|
73
|
-
|
74
|
-
def subscribe(event, &callback)
|
75
|
-
(@event_dispatcher_listeners[event] ||= []) << callback
|
76
|
-
end
|
77
|
-
|
78
|
-
def clearEvents(event)
|
79
|
-
@event_dispatcher_listeners[event] ||= []
|
80
|
-
@event_dispatcher_listeners[event].clear
|
81
|
-
end
|
82
|
-
|
83
|
-
def notify(event, *args)
|
84
|
-
if @event_dispatcher_listeners[event]
|
85
|
-
# puts "NOTIFY: #{self}(:#{event}) [#{@event_dispatcher_listeners[event].length}]" if $DEBUG
|
86
|
-
@event_dispatcher_listeners[event].each do |m|
|
87
|
-
m.call(*args) if m.respond_to? :call
|
88
|
-
end
|
89
|
-
end
|
90
|
-
end
|
91
|
-
|
92
|
-
def settings
|
93
|
-
@opts
|
94
|
-
end
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
def get_page(url, opts={})
|
99
|
-
ro = {}.update @opts
|
100
|
-
ro.update opts
|
101
|
-
agent = Crawler::Agent.new(ro)
|
102
|
-
page = nil
|
103
|
-
page = agent.get url
|
104
|
-
return agent, page
|
105
|
-
end
|
106
|
-
|
107
|
-
def initialize(opts={})
|
108
|
-
@event_dispatcher_listeners = Hash.new
|
109
|
-
@status_lock = Mutex.new
|
110
|
-
|
111
|
-
@opts = {
|
112
|
-
:submit_forms => true,
|
113
|
-
:max_depth => 5,
|
114
|
-
:max_repeat => 20,
|
115
|
-
:max_threads => 4,
|
116
|
-
:user_agent => "watobo-crawler",
|
117
|
-
:proxy_host => '127.0.0.1',
|
118
|
-
:proxy_port => Watobo::Conf::Interceptor.port,
|
119
|
-
:delay => 0,
|
120
|
-
:head_request_pattern => '(pdf|swf|doc|flv|jpg|png|gif)',
|
121
|
-
:allowed_hosts => [], # regex's
|
122
|
-
:allowed_urls => [], # regex's
|
123
|
-
:excluded_urls => ["logout"], # regex's
|
124
|
-
:excluded_fields => ["userid","username","password"], # regex's'
|
125
|
-
:excluded_form_names => [], # regex's'
|
126
|
-
:root_path => "", # regex
|
127
|
-
:username => "",
|
128
|
-
:password => "",
|
129
|
-
:auth_uri => nil,
|
130
|
-
:auth_domain => "", # for ntlm auth
|
131
|
-
:cookie_jar => nil
|
132
|
-
}
|
133
|
-
|
134
|
-
@opts.update opts
|
135
|
-
@opts[:head_request_pattern] = '' if @opts[:head_request_pattern].nil?
|
136
|
-
|
137
|
-
@stats = {
|
138
|
-
:total_requests => 0
|
139
|
-
}
|
140
|
-
|
141
|
-
@link_keys = Hash.new
|
142
|
-
@link_counts = Hash.new
|
143
|
-
|
144
|
-
@form_keys = Hash.new
|
145
|
-
@form_counts = Hash.new
|
146
|
-
|
147
|
-
end
|
148
|
-
|
149
|
-
def pause
|
150
|
-
false
|
151
|
-
end
|
152
|
-
|
153
|
-
def cancel
|
154
|
-
puts "[CRAWLER] - CANCEL!!"
|
155
|
-
@status_lock.synchronize do
|
156
|
-
@engine_status = CRAWL_NONE
|
157
|
-
end
|
158
|
-
@grabber_threads.each do |gt|
|
159
|
-
puts "Killing Thread #{gt}"
|
160
|
-
gt.kill
|
161
|
-
gt.raise "CANCEL"
|
162
|
-
end
|
163
|
-
@grabber_threads.each{|t| t.join }
|
164
|
-
|
165
|
-
@link_queue.clear
|
166
|
-
@page_queue.clear
|
167
|
-
@grabber_threads.clear
|
168
|
-
@link_keys.clear
|
169
|
-
@link_counts.clear
|
170
|
-
|
171
|
-
@form_keys.clear
|
172
|
-
@form_counts.clear
|
173
|
-
|
174
|
-
notify( :update_status, current_status )
|
175
|
-
puts "CANCELED - CANCELED"
|
176
|
-
# exit
|
177
|
-
end
|
3
|
+
#.
|
4
|
+
# Copyright 2014 by siberas, http://www.siberas.de
|
5
|
+
# This file is part of WATOBO (Web Application Tool Box) http://watobo.sourceforge.com
|
6
|
+
# WATOBO is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License.
|
7
|
+
# WATOBO is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
8
|
+
# You should have received a copy of the GNU General Public License along with WATOBO; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
178
9
|
|
10
|
+
# @private
|
11
|
+
module Watobo#:nodoc: all
|
12
|
+
module Crawler
|
13
|
+
|
14
|
+
class Agent < Mechanize
|
15
|
+
|
16
|
+
def initialize(opts)
|
17
|
+
super()
|
18
|
+
|
19
|
+
|
20
|
+
self.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
21
|
+
self.ignore_bad_chunking = true
|
22
|
+
self.keep_alive = false
|
23
|
+
|
24
|
+
self.user_agent = opts[:user_agent] if opts.has_key?(:user_agent)
|
25
|
+
|
26
|
+
if opts.has_key? :username and opts.has_key? :password
|
27
|
+
unless opts[:username].empty? and opts[:password].empty?
|
28
|
+
|
29
|
+
user = opts[:username]
|
30
|
+
pw = opts[:password]
|
31
|
+
uri = opts[:auth_uri]
|
32
|
+
# puts "Got Credentials for #{uri}: #{user} / #{pw}"
|
33
|
+
self.add_auth(uri, user , pw )
|
34
|
+
# TODO: remove this workaround for a Mechanize Bug (#243)
|
35
|
+
p = self.get uri
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
if ( opts.has_key? :proxy_host ) && ( opts.has_key? :proxy_port )
|
40
|
+
self.set_proxy( opts[:proxy_host], opts[:proxy_port] )
|
41
|
+
end
|
42
|
+
|
43
|
+
if opts.has_key? :pre_connect_hook
|
44
|
+
self.pre_connect_hooks << opts[:pre_connect_hook] if opts[:pre_connect_hook].respond_to? :call
|
45
|
+
end
|
46
|
+
|
47
|
+
unless opts[:cookie_jar].nil?
|
48
|
+
clean_jar = Mechanize::CookieJar.new
|
49
|
+
opts[:cookie_jar].each{ |cookie|
|
50
|
+
clean_jar.add! cookie
|
51
|
+
}
|
52
|
+
self.cookie_jar = clean_jar
|
53
|
+
end
|
54
|
+
|
55
|
+
end
|
56
|
+
|
57
|
+
end
|
58
|
+
|
59
|
+
class Engine
|
60
|
+
include Watobo::Plugin::Crawler::Constants
|
61
|
+
|
62
|
+
def subscribe(event, &callback)
|
63
|
+
(@event_dispatcher_listeners[event] ||= []) << callback
|
64
|
+
end
|
65
|
+
|
66
|
+
def clearEvents(event)
|
67
|
+
@event_dispatcher_listeners[event] ||= []
|
68
|
+
@event_dispatcher_listeners[event].clear
|
69
|
+
end
|
70
|
+
|
71
|
+
def notify(event, *args)
|
72
|
+
if @event_dispatcher_listeners[event]
|
73
|
+
# puts "NOTIFY: #{self}(:#{event}) [#{@event_dispatcher_listeners[event].length}]" if $DEBUG
|
74
|
+
@event_dispatcher_listeners[event].each do |m|
|
75
|
+
m.call(*args) if m.respond_to? :call
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
def settings
|
81
|
+
@opts
|
82
|
+
end
|
83
|
+
|
84
|
+
|
85
|
+
|
86
|
+
def get_page(url, opts={})
|
87
|
+
ro = {}.update @opts
|
88
|
+
ro.update opts
|
89
|
+
agent = Crawler::Agent.new(ro)
|
90
|
+
page = nil
|
91
|
+
page = agent.get url
|
92
|
+
return agent, page
|
93
|
+
end
|
94
|
+
|
95
|
+
def initialize(opts={})
|
96
|
+
@event_dispatcher_listeners = Hash.new
|
97
|
+
@status_lock = Mutex.new
|
98
|
+
|
99
|
+
@opts = {
|
100
|
+
:submit_forms => true,
|
101
|
+
:max_depth => 5,
|
102
|
+
:max_repeat => 20,
|
103
|
+
:max_threads => 4,
|
104
|
+
:user_agent => "watobo-crawler",
|
105
|
+
:proxy_host => '127.0.0.1',
|
106
|
+
:proxy_port => Watobo::Conf::Interceptor.port,
|
107
|
+
:delay => 0,
|
108
|
+
:head_request_pattern => '(pdf|swf|doc|flv|jpg|png|gif)',
|
109
|
+
:allowed_hosts => [], # regex's
|
110
|
+
:allowed_urls => [], # regex's
|
111
|
+
:excluded_urls => ["logout"], # regex's
|
112
|
+
:excluded_fields => ["userid","username","password"], # regex's'
|
113
|
+
:excluded_form_names => [], # regex's'
|
114
|
+
:root_path => "", # regex
|
115
|
+
:username => "",
|
116
|
+
:password => "",
|
117
|
+
:auth_uri => nil,
|
118
|
+
:auth_domain => "", # for ntlm auth
|
119
|
+
:cookie_jar => nil
|
120
|
+
}
|
121
|
+
|
122
|
+
@opts.update opts
|
123
|
+
@opts[:head_request_pattern] = '' if @opts[:head_request_pattern].nil?
|
124
|
+
|
125
|
+
@stats = {
|
126
|
+
:total_requests => 0
|
127
|
+
}
|
128
|
+
|
129
|
+
@link_keys = Hash.new
|
130
|
+
@link_counts = Hash.new
|
131
|
+
|
132
|
+
@form_keys = Hash.new
|
133
|
+
@form_counts = Hash.new
|
134
|
+
|
135
|
+
end
|
136
|
+
|
137
|
+
def pause
|
138
|
+
false
|
139
|
+
end
|
140
|
+
|
141
|
+
def cancel
|
142
|
+
puts "[CRAWLER] - CANCEL!!"
|
143
|
+
#@status_lock.synchronize do
|
144
|
+
# @engine_status = CRAWL_NONE
|
145
|
+
#end
|
146
|
+
Watobo::Crawler::Status.engine = CRAWL_NONE
|
147
|
+
@grabber_threads.each do |gt|
|
148
|
+
puts "Killing Thread #{gt}"
|
149
|
+
gt.kill
|
150
|
+
gt.raise "CANCEL"
|
151
|
+
end
|
152
|
+
@grabber_threads.each{|t| t.join }
|
153
|
+
|
154
|
+
@link_queue.clear
|
155
|
+
@page_queue.clear
|
156
|
+
@grabber_threads.clear
|
157
|
+
@link_keys.clear
|
158
|
+
@link_counts.clear
|
159
|
+
|
160
|
+
@form_keys.clear
|
161
|
+
@form_counts.clear
|
162
|
+
|
163
|
+
#notify( :update_status, current_status )
|
164
|
+
puts "CANCELED - CANCELED"
|
165
|
+
# exit
|
166
|
+
end
|
167
|
+
|
179
168
|
def run(url, opts={})
|
180
|
-
|
181
|
-
|
182
|
-
|
169
|
+
#engine_status = CRAWL_RUNNING
|
170
|
+
Watobo::Crawler::Status.reset
|
171
|
+
Watobo::Crawler::Status.engine = CRAWL_RUNNING
|
172
|
+
|
173
|
+
@opts.update opts
|
183
174
|
@opts[:head_request_pattern] = '' if @opts[:head_request_pattern].nil?
|
184
175
|
|
185
176
|
puts "crawler settings:"
|
186
177
|
puts @opts.to_json
|
187
|
-
|
188
|
-
|
189
|
-
@link_queue = Queue.new
|
190
|
-
@page_queue = Queue.new
|
191
|
-
@link_keys = Hash.new
|
192
|
-
@link_counts = Hash.new
|
193
|
-
|
194
|
-
@form_keys = Hash.new
|
195
|
-
@form_counts = Hash.new
|
178
|
+
|
179
|
+
|
180
|
+
@link_queue = Queue.new
|
181
|
+
@page_queue = Queue.new
|
196
182
|
|
197
|
-
@
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
@
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
183
|
+
@link_keys = Hash.new
|
184
|
+
@link_counts = Hash.new
|
185
|
+
|
186
|
+
@form_keys = Hash.new
|
187
|
+
@form_counts = Hash.new
|
188
|
+
|
189
|
+
@skipped_sites = Hash.new
|
190
|
+
|
191
|
+
@grabber_threads = []
|
192
|
+
start_link = URI.parse url
|
193
|
+
return false if start_link.host.nil?
|
194
|
+
|
195
|
+
allow_host(start_link)
|
196
|
+
|
197
|
+
@link_queue.enq LinkBag.new(start_link, 0)
|
198
|
+
|
199
|
+
|
200
|
+
notify(:log, "Crawling #{url} started ..." )
|
201
|
+
|
202
|
+
@opts[:max_threads].times do |i|
|
203
|
+
g = Grabber.new(@link_queue, @page_queue, @opts )
|
204
|
+
@grabber_threads << g.run
|
205
|
+
end
|
206
|
+
|
207
|
+
puts "* startet #{@grabber_threads.length} grabbers"
|
208
|
+
|
209
|
+
loop do
|
210
|
+
pagebag = @page_queue.deq
|
211
|
+
|
212
|
+
process_links(pagebag)
|
213
|
+
|
214
|
+
process_forms(pagebag)
|
215
|
+
#@stats[:total_requests] += 1 unless pagebag.nil?
|
216
|
+
Watobo::Crawler::Status.inc_requests() unless pagebag.nil?
|
217
|
+
Watobo::Crawler::Status.page_size= @page_queue.size
|
218
|
+
Watobo::Crawler::Status.link_size= @link_queue.size
|
219
|
+
|
220
|
+
puts "Links/Pages: #{@link_queue.size}/#{@page_queue.size}"
|
221
|
+
#notify( :update_status, current_status )
|
222
|
+
# if @link_queue.empty? and @page_queue.empty?
|
223
|
+
if @page_queue.empty?
|
224
|
+
# if page_queue is empty wait for all grabber threads finishing the link_queue
|
225
|
+
until @link_queue.num_waiting == @grabber_threads.length
|
226
|
+
Thread.pass
|
227
|
+
end
|
228
|
+
# when the link_queue is finished check the page_queue. Crawling is finished if page_queue is empty too.
|
229
|
+
if @page_queue.empty?
|
230
|
+
@grabber_threads.each { |t| t.kill }
|
231
|
+
puts "Finished Crawling"
|
232
|
+
#@status_lock.synchronize{ @engine_status = CRAWL_NONE }
|
233
|
+
Watobo::Crawler::Status.engine = CRAWL_NONE
|
234
|
+
|
235
|
+
notify(:log, "Crawling finished")
|
236
|
+
#notify( :update_status, current_status )
|
237
|
+
break
|
238
|
+
|
239
|
+
end
|
240
|
+
end
|
241
|
+
|
242
|
+
end
|
243
|
+
|
244
|
+
end
|
245
|
+
|
246
|
+
private
|
247
|
+
|
248
|
+
def current_status
|
252
249
|
{
|
253
|
-
:engine_status => @engine_status,
|
254
|
-
:link_size => @link_queue.size,
|
255
|
-
:page_size => @page_queue.size
|
256
|
-
}.update @stats
|
257
|
-
|
258
|
-
end
|
259
|
-
|
260
|
-
|
261
|
-
def allow_host(uri)
|
262
|
-
if uri.is_a? URI
|
263
|
-
site = uri.site.to_s
|
264
|
-
# puts "Valid Site: #{site}"
|
265
|
-
ah = allowed_hosts
|
266
|
-
ah << site
|
267
|
-
end
|
268
|
-
end
|
269
|
-
|
270
|
-
def process_forms(pagebag)
|
271
|
-
return false unless pagebag.respond_to? :page
|
272
|
-
page=pagebag.page
|
273
|
-
return false unless page.respond_to? :forms
|
274
|
-
page.forms.each do |f|
|
275
|
-
|
276
|
-
action = page.uri.merge f.action unless f.action =~ /^http/
|
277
|
-
f.action = action.to_s
|
278
|
-
|
279
|
-
if send_form? f
|
280
|
-
# puts "SUBMIT FORM: #{f.action}"
|
281
|
-
send_form(f, pagebag.depth)
|
282
|
-
end
|
283
|
-
end
|
284
|
-
end
|
285
|
-
|
286
|
-
def process_links(pagebag)
|
287
|
-
return false unless pagebag.respond_to? :page
|
288
|
-
page = pagebag.page
|
289
|
-
return false unless page.respond_to? :links
|
290
|
-
|
291
|
-
page.links.each do |l|
|
292
|
-
begin
|
250
|
+
:engine_status => @engine_status,
|
251
|
+
:link_size => @link_queue.size,
|
252
|
+
:page_size => @page_queue.size
|
253
|
+
}.update @stats
|
254
|
+
|
255
|
+
end
|
256
|
+
|
257
|
+
|
258
|
+
def allow_host(uri)
|
259
|
+
if uri.is_a? URI
|
260
|
+
site = uri.site.to_s
|
261
|
+
# puts "Valid Site: #{site}"
|
262
|
+
ah = allowed_hosts
|
263
|
+
ah << site
|
264
|
+
end
|
265
|
+
end
|
266
|
+
|
267
|
+
def process_forms(pagebag)
|
268
|
+
return false unless pagebag.respond_to? :page
|
269
|
+
page=pagebag.page
|
270
|
+
return false unless page.respond_to? :forms
|
271
|
+
page.forms.each do |f|
|
272
|
+
|
273
|
+
action = page.uri.merge f.action unless f.action =~ /^http/
|
274
|
+
f.action = action.to_s
|
275
|
+
|
276
|
+
if send_form? f
|
277
|
+
# puts "SUBMIT FORM: #{f.action}"
|
278
|
+
send_form(f, pagebag.depth)
|
279
|
+
end
|
280
|
+
end
|
281
|
+
end
|
282
|
+
|
283
|
+
def process_links(pagebag)
|
284
|
+
return false unless pagebag.respond_to? :page
|
285
|
+
page = pagebag.page
|
286
|
+
return false unless page.respond_to? :links
|
287
|
+
|
288
|
+
page.links.each do |l|
|
289
|
+
begin
|
293
290
|
link = l
|
294
291
|
next if l.href.nil?
|
295
|
-
|
296
|
-
link = page.uri.merge l.uri unless l.href =~ /^http/
|
297
|
-
# puts "FOLLOW LINK #{link} ?"
|
298
|
-
if follow_link? link
|
299
|
-
# puts ">> OK"
|
300
|
-
submit_link(link, pagebag.depth)
|
301
|
-
else
|
302
|
-
# puts ">> NO"
|
303
|
-
end
|
304
|
-
rescue => bang
|
292
|
+
|
293
|
+
link = page.uri.merge l.uri unless l.href =~ /^http/
|
294
|
+
# puts "FOLLOW LINK #{link} ?"
|
295
|
+
if follow_link? link
|
296
|
+
# puts ">> OK"
|
297
|
+
submit_link(link, pagebag.depth)
|
298
|
+
else
|
299
|
+
# puts ">> NO"
|
300
|
+
end
|
301
|
+
rescue => bang
|
305
302
|
puts bang
|
306
|
-
puts bang.backtrace if $DEBUG
|
307
|
-
end
|
308
|
-
end
|
309
|
-
|
310
|
-
end
|
311
|
-
|
312
|
-
|
313
|
-
def submit_link(link, depth)
|
314
|
-
# @link_keys[link_key(link)] = link
|
315
|
-
|
316
|
-
clk = link_key(link, :clear_values => true)
|
317
|
-
@link_counts[clk] ||= 0
|
318
|
-
@link_counts[clk] += 1
|
319
|
-
lk = link_key(link)
|
320
|
-
return false if @link_keys.has_key? lk
|
321
|
-
@link_keys[lk] = nil
|
322
|
-
if @link_counts[clk] < @opts[:max_repeat]
|
323
|
-
@link_queue.enq LinkBag.new(link, depth)
|
324
|
-
else
|
325
|
-
puts "! MAX REPEAT !\nSkipped link #{link}" if $DEBUG
|
326
|
-
end
|
327
|
-
end
|
328
|
-
|
329
|
-
def form_key(form, opts={} )
|
330
|
-
o = { :clear_values => false }
|
331
|
-
o.update opts
|
332
|
-
|
333
|
-
fp = "#{form.action}"
|
334
|
-
fp << form.method
|
335
|
-
if form.request_data =~ /=/
|
336
|
-
data = form.request_data.split("&").sort.join("&")
|
337
|
-
if o[:clear_values]
|
338
|
-
fp << data.gsub(/=[^&]*/,'=')
|
339
|
-
else
|
340
|
-
fp << data
|
341
|
-
end
|
342
|
-
end
|
343
|
-
fkey = Digest::MD5.hexdigest fp
|
344
|
-
fkey
|
345
|
-
end
|
346
|
-
|
347
|
-
def send_form(form, depth)
|
348
|
-
return false if @engine_status == CRAWL_NONE
|
349
|
-
cfk = form_key(form, :clear_values => true)
|
350
|
-
@form_counts[cfk] ||= 0
|
351
|
-
@form_counts[cfk] += 1
|
352
|
-
|
353
|
-
# @form_keys[form_key(form)] = form
|
354
|
-
fk = form_key(form)
|
355
|
-
return false if @form_keys.has_key? fk
|
356
|
-
@form_keys[fk] = nil
|
357
|
-
begin
|
358
|
-
if @form_counts[cfk] < @opts[:max_repeat]
|
359
|
-
if form.buttons.length > 0
|
360
|
-
p = form.click_button
|
361
|
-
else
|
362
|
-
p = form.submit()
|
363
|
-
end
|
364
|
-
puts p.class
|
365
|
-
@page_queue.enq PageBag.new(p, depth+1)
|
366
|
-
else
|
367
|
-
puts "! MAX REPEAT !\nSkipped Form #{form.action}"
|
368
|
-
end
|
369
|
-
rescue => bang
|
370
|
-
puts bang
|
371
|
-
puts bang.backtrace
|
372
|
-
end
|
373
|
-
end
|
374
|
-
|
375
|
-
def send_form?(form)
|
376
|
-
# puts "SEND FORM?"
|
377
|
-
return false unless engine_running?
|
378
|
-
return false unless @opts[:submit_forms] == true
|
379
|
-
# puts "> submit_forms"
|
380
|
-
return false unless allowed? form.action
|
381
|
-
#puts "> allowed"
|
382
|
-
return false unless fields_allowed? form
|
383
|
-
#puts "> fields allowed"
|
384
|
-
return false if form_sent? form
|
385
|
-
# puts "> form not sent"
|
386
|
-
return true
|
387
|
-
end
|
388
|
-
|
389
|
-
def follow_link?(link)
|
390
|
-
return false unless allowed? link
|
391
|
-
return false if link_is_followed? link
|
392
|
-
return true
|
393
|
-
end
|
394
|
-
|
395
|
-
def host_allowed?(uri)
|
396
|
-
#puts "ALLOWED HOSTS =>"
|
397
|
-
#puts allowed_hosts
|
398
|
-
#puts "---"
|
399
|
-
# puts "Host Allowed?"
|
400
|
-
ah = allowed_hosts
|
401
|
-
# puts ah.class
|
402
|
-
#puts ah
|
403
|
-
return false if ah.empty?
|
404
|
-
ahc = ah.select{ |h| uri.site =~ /^#{h}$/ }.length
|
405
|
-
if ahc > 0
|
406
|
-
# puts "> Host IS allowed!"
|
407
|
-
return true
|
408
|
-
end
|
409
|
-
# puts "> Host is NOT allowed!"
|
410
|
-
return false
|
411
|
-
end
|
412
|
-
|
413
|
-
def url_allowed?(uri)
|
414
|
-
# puts "* excluded_urls"
|
415
|
-
# puts exluded_urls
|
416
|
-
return false if excluded_urls.select{ |url| uri.path =~ /#{url}/ }.length > 0
|
417
|
-
# puts "* allowed_urls"
|
418
|
-
# puts allowed_urls
|
419
|
-
return true if allowed_urls.empty?
|
420
|
-
return true if allowed_urls.select{ |url| uri.path =~ /#{url}/ }.length > 0
|
421
|
-
# puts "> URL is NOT allowed"
|
422
|
-
return false
|
423
|
-
end
|
424
|
-
|
425
|
-
def path_allowed?(uri)
|
426
|
-
return true if root_path.nil?
|
427
|
-
return true if root_path.empty?
|
428
|
-
return true if uri.path =~ /^#{root_path}/
|
429
|
-
# puts "> PATH is NOT ALLOWED"
|
430
|
-
return false
|
431
|
-
end
|
432
|
-
|
433
|
-
def cleanup_uri(obj)
|
434
|
-
uri = nil
|
435
|
-
uri = obj.uri if obj.respond_to? :uri
|
436
|
-
uri = URI.parse(obj) if obj.is_a? String
|
437
|
-
uri = obj if obj.is_a? URI::HTTP
|
438
|
-
uri
|
439
|
-
end
|
440
|
-
|
441
|
-
def allowed?(link)
|
442
|
-
valid = false
|
443
|
-
# need to handle different link objects, Mechanize::Page::Link and URIs
|
444
|
-
uri = nil
|
445
|
-
uri = link.uri if link.respond_to? :uri
|
446
|
-
uri = URI.parse(link) if link.is_a? String
|
447
|
-
uri = link if link.is_a? URI::HTTP
|
448
|
-
|
449
|
-
return false if uri.nil?
|
450
|
-
|
451
|
-
host_allowed?(uri) &&
|
452
|
-
url_allowed?(uri) &&
|
453
|
-
path_allowed?(uri)
|
454
|
-
end
|
455
|
-
|
456
|
-
def form_sent?(form)
|
457
|
-
|
458
|
-
@form_keys.has_key? form_key(form)
|
459
|
-
end
|
460
|
-
|
461
|
-
def link_key(link, opts={})
|
462
|
-
o = { :clear_values => false }
|
463
|
-
o.update opts
|
464
|
-
|
465
|
-
uri = cleanup_uri(link)
|
466
|
-
|
467
|
-
query_sorted = ""
|
468
|
-
query_sorted = uri.query.split("&").sort.join("&") unless uri.query.nil?
|
469
|
-
|
470
|
-
key = ""
|
471
|
-
key << uri.scheme
|
472
|
-
key << uri.site
|
473
|
-
key << uri.path
|
474
|
-
key << query_sorted
|
475
|
-
key.gsub!(/=[^&]*/,'=') if o[:clear_values] == true
|
476
|
-
|
477
|
-
Digest::MD5.hexdigest key
|
478
|
-
end
|
479
|
-
|
480
|
-
def engine_running?
|
481
|
-
@status_lock.synchronize do
|
482
|
-
return false if @engine_status == CRAWL_NONE
|
483
|
-
return true
|
484
|
-
end
|
485
|
-
end
|
486
|
-
|
487
|
-
def link_is_followed?(link)
|
488
|
-
|
489
|
-
return true if @link_keys.has_key? link_key(link)
|
490
|
-
|
491
|
-
false
|
492
|
-
end
|
493
|
-
|
494
|
-
def fields_allowed?(form)
|
495
|
-
form.fields.each do |f|
|
496
|
-
excluded_fields.each do |ef|
|
497
|
-
return false if f.name =~ /#{ef}/
|
498
|
-
end
|
499
|
-
end
|
500
|
-
return true
|
501
|
-
end
|
502
|
-
|
503
|
-
def method_missing(name, *args, &block)
|
504
|
-
# puts "* instance method missing (#{name})"
|
505
|
-
if name =~ /(.*)=$/
|
506
|
-
@opts.has_key? $1.to_sym || super
|
507
|
-
@opts[$1.to_sym] = args[0]
|
508
|
-
return @opts[$1.to_sym]
|
509
|
-
else
|
510
|
-
k = name.to_sym
|
511
|
-
@opts.has_key? k || super
|
512
|
-
# puts "Value Found For #{k.to_yaml}"
|
513
|
-
return @opts[k]
|
514
|
-
|
515
|
-
end
|
516
|
-
end
|
517
|
-
end
|
518
|
-
end
|
519
|
-
|
520
|
-
end
|
303
|
+
puts bang.backtrace if $DEBUG
|
304
|
+
end
|
305
|
+
end
|
306
|
+
|
307
|
+
end
|
308
|
+
|
309
|
+
|
310
|
+
def submit_link(link, depth)
|
311
|
+
# @link_keys[link_key(link)] = link
|
312
|
+
|
313
|
+
clk = link_key(link, :clear_values => true)
|
314
|
+
@link_counts[clk] ||= 0
|
315
|
+
@link_counts[clk] += 1
|
316
|
+
lk = link_key(link)
|
317
|
+
return false if @link_keys.has_key? lk
|
318
|
+
@link_keys[lk] = nil
|
319
|
+
if @link_counts[clk] < @opts[:max_repeat]
|
320
|
+
@link_queue.enq LinkBag.new(link, depth)
|
321
|
+
else
|
322
|
+
puts "! MAX REPEAT !\nSkipped link #{link}" if $DEBUG
|
323
|
+
end
|
324
|
+
end
|
325
|
+
|
326
|
+
def form_key(form, opts={} )
|
327
|
+
o = { :clear_values => false }
|
328
|
+
o.update opts
|
329
|
+
|
330
|
+
fp = "#{form.action}"
|
331
|
+
fp << form.method
|
332
|
+
if form.request_data =~ /=/
|
333
|
+
data = form.request_data.split("&").sort.join("&")
|
334
|
+
if o[:clear_values]
|
335
|
+
fp << data.gsub(/=[^&]*/,'=')
|
336
|
+
else
|
337
|
+
fp << data
|
338
|
+
end
|
339
|
+
end
|
340
|
+
fkey = Digest::MD5.hexdigest fp
|
341
|
+
fkey
|
342
|
+
end
|
343
|
+
|
344
|
+
def send_form(form, depth)
|
345
|
+
return false if @engine_status == CRAWL_NONE
|
346
|
+
cfk = form_key(form, :clear_values => true)
|
347
|
+
@form_counts[cfk] ||= 0
|
348
|
+
@form_counts[cfk] += 1
|
349
|
+
|
350
|
+
# @form_keys[form_key(form)] = form
|
351
|
+
fk = form_key(form)
|
352
|
+
return false if @form_keys.has_key? fk
|
353
|
+
@form_keys[fk] = nil
|
354
|
+
begin
|
355
|
+
if @form_counts[cfk] < @opts[:max_repeat]
|
356
|
+
if form.buttons.length > 0
|
357
|
+
p = form.click_button
|
358
|
+
else
|
359
|
+
p = form.submit()
|
360
|
+
end
|
361
|
+
puts p.class
|
362
|
+
@page_queue.enq PageBag.new(p, depth+1)
|
363
|
+
else
|
364
|
+
puts "! MAX REPEAT !\nSkipped Form #{form.action}"
|
365
|
+
end
|
366
|
+
rescue => bang
|
367
|
+
puts bang
|
368
|
+
puts bang.backtrace
|
369
|
+
end
|
370
|
+
end
|
371
|
+
|
372
|
+
def send_form?(form)
|
373
|
+
# puts "SEND FORM?"
|
374
|
+
return false unless engine_running?
|
375
|
+
return false unless @opts[:submit_forms] == true
|
376
|
+
# puts "> submit_forms"
|
377
|
+
return false unless allowed? form.action
|
378
|
+
#puts "> allowed"
|
379
|
+
return false unless fields_allowed? form
|
380
|
+
#puts "> fields allowed"
|
381
|
+
return false if form_sent? form
|
382
|
+
# puts "> form not sent"
|
383
|
+
return true
|
384
|
+
end
|
385
|
+
|
386
|
+
def follow_link?(link)
|
387
|
+
return false unless allowed? link
|
388
|
+
return false if link_is_followed? link
|
389
|
+
return true
|
390
|
+
end
|
391
|
+
|
392
|
+
def host_allowed?(uri)
|
393
|
+
#puts "ALLOWED HOSTS =>"
|
394
|
+
#puts allowed_hosts
|
395
|
+
#puts "---"
|
396
|
+
# puts "Host Allowed?"
|
397
|
+
ah = allowed_hosts
|
398
|
+
# puts ah.class
|
399
|
+
#puts ah
|
400
|
+
return false if ah.empty?
|
401
|
+
ahc = ah.select{ |h| uri.site =~ /^#{h}$/ }.length
|
402
|
+
if ahc > 0
|
403
|
+
# puts "> Host IS allowed!"
|
404
|
+
return true
|
405
|
+
end
|
406
|
+
# puts "> Host is NOT allowed!"
|
407
|
+
return false
|
408
|
+
end
|
409
|
+
|
410
|
+
def url_allowed?(uri)
|
411
|
+
# puts "* excluded_urls"
|
412
|
+
# puts exluded_urls
|
413
|
+
return false if excluded_urls.select{ |url| uri.path =~ /#{url}/ }.length > 0
|
414
|
+
# puts "* allowed_urls"
|
415
|
+
# puts allowed_urls
|
416
|
+
return true if allowed_urls.empty?
|
417
|
+
return true if allowed_urls.select{ |url| uri.path =~ /#{url}/ }.length > 0
|
418
|
+
# puts "> URL is NOT allowed"
|
419
|
+
return false
|
420
|
+
end
|
421
|
+
|
422
|
+
def path_allowed?(uri)
|
423
|
+
return true if root_path.nil?
|
424
|
+
return true if root_path.empty?
|
425
|
+
return true if uri.path =~ /^#{root_path}/
|
426
|
+
# puts "> PATH is NOT ALLOWED"
|
427
|
+
return false
|
428
|
+
end
|
429
|
+
|
430
|
+
def cleanup_uri(obj)
|
431
|
+
uri = nil
|
432
|
+
uri = obj.uri if obj.respond_to? :uri
|
433
|
+
uri = URI.parse(obj) if obj.is_a? String
|
434
|
+
uri = obj if obj.is_a? URI::HTTP
|
435
|
+
uri
|
436
|
+
end
|
437
|
+
|
438
|
+
def allowed?(link)
|
439
|
+
valid = false
|
440
|
+
# need to handle different link objects, Mechanize::Page::Link and URIs
|
441
|
+
uri = nil
|
442
|
+
uri = link.uri if link.respond_to? :uri
|
443
|
+
uri = URI.parse(link) if link.is_a? String
|
444
|
+
uri = link if link.is_a? URI::HTTP
|
445
|
+
|
446
|
+
return false if uri.nil?
|
447
|
+
|
448
|
+
host_allowed?(uri) &&
|
449
|
+
url_allowed?(uri) &&
|
450
|
+
path_allowed?(uri)
|
451
|
+
end
|
452
|
+
|
453
|
+
def form_sent?(form)
|
454
|
+
|
455
|
+
@form_keys.has_key? form_key(form)
|
456
|
+
end
|
457
|
+
|
458
|
+
def link_key(link, opts={})
|
459
|
+
o = { :clear_values => false }
|
460
|
+
o.update opts
|
461
|
+
|
462
|
+
uri = cleanup_uri(link)
|
463
|
+
|
464
|
+
query_sorted = ""
|
465
|
+
query_sorted = uri.query.split("&").sort.join("&") unless uri.query.nil?
|
466
|
+
|
467
|
+
key = ""
|
468
|
+
key << uri.scheme
|
469
|
+
key << uri.site
|
470
|
+
key << uri.path
|
471
|
+
key << query_sorted
|
472
|
+
key.gsub!(/=[^&]*/,'=') if o[:clear_values] == true
|
473
|
+
|
474
|
+
Digest::MD5.hexdigest key
|
475
|
+
end
|
476
|
+
|
477
|
+
def engine_running?
|
478
|
+
@status_lock.synchronize do
|
479
|
+
return false if @engine_status == CRAWL_NONE
|
480
|
+
return true
|
481
|
+
end
|
482
|
+
end
|
483
|
+
|
484
|
+
def link_is_followed?(link)
|
485
|
+
|
486
|
+
return true if @link_keys.has_key? link_key(link)
|
487
|
+
|
488
|
+
false
|
489
|
+
end
|
490
|
+
|
491
|
+
def fields_allowed?(form)
|
492
|
+
form.fields.each do |f|
|
493
|
+
excluded_fields.each do |ef|
|
494
|
+
return false if f.name =~ /#{ef}/
|
495
|
+
end
|
496
|
+
end
|
497
|
+
return true
|
498
|
+
end
|
499
|
+
|
500
|
+
def method_missing(name, *args, &block)
|
501
|
+
# puts "* instance method missing (#{name})"
|
502
|
+
if name =~ /(.*)=$/
|
503
|
+
@opts.has_key? $1.to_sym || super
|
504
|
+
@opts[$1.to_sym] = args[0]
|
505
|
+
return @opts[$1.to_sym]
|
506
|
+
else
|
507
|
+
k = name.to_sym
|
508
|
+
@opts.has_key? k || super
|
509
|
+
# puts "Value Found For #{k.to_yaml}"
|
510
|
+
return @opts[k]
|
511
|
+
|
512
|
+
end
|
513
|
+
end
|
514
|
+
end
|
515
|
+
end
|
516
|
+
|
517
|
+
end
|