pstuteville-scrubyt 0.4.31

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (520):
  1. data/CHANGELOG +355 -0
  2. data/COPYING +340 -0
  3. data/README.rdoc +121 -0
  4. data/Rakefile +120 -0
  5. data/VERSION +1 -0
  6. data/examples/README.txt +1 -0
  7. data/examples/events/delta/input.html +682 -0
  8. data/examples/events/delta/test.rb +16 -0
  9. data/examples/misc/auto_regex/input.html +22 -0
  10. data/examples/misc/auto_regex/test.rb +14 -0
  11. data/examples/misc/compound_example/advanced/test.rb +11 -0
  12. data/examples/misc/compound_example/advanced/tricky_compound.html +9 -0
  13. data/examples/misc/compound_example/regexp/regexp_compound.html +17 -0
  14. data/examples/misc/compound_example/regexp/test.rb +11 -0
  15. data/examples/misc/compound_example/simple/compound.html +5 -0
  16. data/examples/misc/compound_example/simple/test.rb +11 -0
  17. data/examples/misc/detail_page/detailpage.html +6 -0
  18. data/examples/misc/detail_page/index.html +9 -0
  19. data/examples/misc/detail_page/test.rb +17 -0
  20. data/examples/misc/google/test.rb +39 -0
  21. data/examples/misc/identical_examples/data_extractor_export.rb +12 -0
  22. data/examples/misc/identical_examples/input.html +16 -0
  23. data/examples/misc/identical_examples/test.rb +15 -0
  24. data/examples/misc/immediate_attribute_extraction/data_extractor_export.rb +10 -0
  25. data/examples/misc/immediate_attribute_extraction/input.html +16 -0
  26. data/examples/misc/immediate_attribute_extraction/test.rb +14 -0
  27. data/examples/misc/multiple_examples/input.html +7 -0
  28. data/examples/misc/multiple_examples/test.rb +22 -0
  29. data/examples/misc/on_click_next/next_page_link.rb +42 -0
  30. data/examples/misc/on_click_next/page_1.html +10 -0
  31. data/examples/misc/on_click_next/page_2.html +10 -0
  32. data/examples/misc/on_click_next/page_3.html +7 -0
  33. data/examples/misc/rubycorner/test.rb +29 -0
  34. data/examples/misc/rubyforge_login/test.rb +30 -0
  35. data/examples/misc/tables/ambigous_records/input.html +17 -0
  36. data/examples/misc/tables/ambigous_records/test.rb +37 -0
  37. data/examples/misc/tables/another_plain_table/input.html +15 -0
  38. data/examples/misc/tables/another_plain_table/test.rb +25 -0
  39. data/examples/misc/tables/complex_table/input.html +45 -0
  40. data/examples/misc/tables/complex_table/test.rb +30 -0
  41. data/examples/misc/tables/grab_rows/input.html +20 -0
  42. data/examples/misc/tables/grab_rows/test.rb +30 -0
  43. data/examples/misc/tables/plain_table/input.html +39 -0
  44. data/examples/misc/tables/plain_table/test.rb +35 -0
  45. data/examples/misc/tables/plain_table_morepages/2.html +38 -0
  46. data/examples/misc/tables/plain_table_morepages/3.html +33 -0
  47. data/examples/misc/tables/plain_table_morepages/input.html +40 -0
  48. data/examples/misc/tables/plain_table_morepages/test.rb +32 -0
  49. data/examples/misc/tables/plain_table_morepages_with_image/2.html +40 -0
  50. data/examples/misc/tables/plain_table_morepages_with_image/3.html +33 -0
  51. data/examples/misc/tables/plain_table_morepages_with_image/images/right_arrow.png +0 -0
  52. data/examples/misc/tables/plain_table_morepages_with_image/input.html +42 -0
  53. data/examples/misc/tables/plain_table_morepages_with_image/test.rb +32 -0
  54. data/examples/misc/tables/test_select_indices/input.html +46 -0
  55. data/examples/misc/tables/test_select_indices/test.rb +55 -0
  56. data/examples/misc/xpath_example_type/input.html +15 -0
  57. data/examples/misc/xpath_example_type/test.rb +18 -0
  58. data/examples/misc/yahoo_finance/test.rb +26 -0
  59. data/examples/social/blog_comment/test.rb +27 -0
  60. data/examples/social/del.icio.us/test.rb +22 -0
  61. data/examples/social/digg/test.rb +37 -0
  62. data/examples/social/dzone/test.rb +28 -0
  63. data/examples/social/linkedin/linkedin.rb +23 -0
  64. data/examples/social/reddit/test.rb +23 -0
  65. data/examples/tones_extractor_export.rb +23 -0
  66. data/examples/webshops/amazon/002-8212888-3924065.html +5311 -0
  67. data/examples/webshops/amazon/002-8212888-3924065_files/0130796034.jpg +0 -0
  68. data/examples/webshops/amazon/002-8212888-3924065_files/020161622X.jpg +0 -0
  69. data/examples/webshops/amazon/002-8212888-3924065_files/0321223675.jpg +0 -0
  70. data/examples/webshops/amazon/002-8212888-3924065_files/0465067107.jpg +0 -0
  71. data/examples/webshops/amazon/002-8212888-3924065_files/0470069155.jpg +0 -0
  72. data/examples/webshops/amazon/002-8212888-3924065_files/0470081201.jpg +0 -0
  73. data/examples/webshops/amazon/002-8212888-3924065_files/0596005253.jpg +0 -0
  74. data/examples/webshops/amazon/002-8212888-3924065_files/0596101325.jpg +0 -0
  75. data/examples/webshops/amazon/002-8212888-3924065_files/0596523696.jpg +0 -0
  76. data/examples/webshops/amazon/002-8212888-3924065_files/0672328844.jpg +0 -0
  77. data/examples/webshops/amazon/002-8212888-3924065_files/0764596861.jpg +0 -0
  78. data/examples/webshops/amazon/002-8212888-3924065_files/0974514055.jpg +0 -0
  79. data/examples/webshops/amazon/002-8212888-3924065_files/0976694069.jpg +0 -0
  80. data/examples/webshops/amazon/002-8212888-3924065_files/0977616606.jpg +0 -0
  81. data/examples/webshops/amazon/002-8212888-3924065_files/0977616614.jpg +0 -0
  82. data/examples/webshops/amazon/002-8212888-3924065_files/0977616630.jpg +0 -0
  83. data/examples/webshops/amazon/002-8212888-3924065_files/1590597362.jpg +0 -0
  84. data/examples/webshops/amazon/002-8212888-3924065_files/1594480060.jpg +0 -0
  85. data/examples/webshops/amazon/002-8212888-3924065_files/1932394699.jpg +0 -0
  86. data/examples/webshops/amazon/002-8212888-3924065_files/2841772101.jpg +0 -0
  87. data/examples/webshops/amazon/002-8212888-3924065_files/amzn-logo-118w.gif +0 -0
  88. data/examples/webshops/amazon/002-8212888-3924065_files/askville-adwidget-bullet.gif +0 -0
  89. data/examples/webshops/amazon/002-8212888-3924065_files/askville-logo-sm-adwidget-white-bg.gif +0 -0
  90. data/examples/webshops/amazon/002-8212888-3924065_files/book_display_on_website-icon.gif +0 -0
  91. data/examples/webshops/amazon/002-8212888-3924065_files/btn-inactive-no-ns.gif +0 -0
  92. data/examples/webshops/amazon/002-8212888-3924065_files/btn-inactive-no.gif +0 -0
  93. data/examples/webshops/amazon/002-8212888-3924065_files/btn-inactive-yes-ns.gif +0 -0
  94. data/examples/webshops/amazon/002-8212888-3924065_files/btn-inactive-yes.gif +0 -0
  95. data/examples/webshops/amazon/002-8212888-3924065_files/btn-no-tiny.gif +0 -0
  96. data/examples/webshops/amazon/002-8212888-3924065_files/btn-yes-tiny.gif +0 -0
  97. data/examples/webshops/amazon/002-8212888-3924065_files/buybox-button-find-gifts-a.gif +0 -0
  98. data/examples/webshops/amazon/002-8212888-3924065_files/c7y_badge_rn_1.gif +0 -0
  99. data/examples/webshops/amazon/002-8212888-3924065_files/cap-a9-3.gif +0 -0
  100. data/examples/webshops/amazon/002-8212888-3924065_files/drop-down-icon-small-arrow.gif +0 -0
  101. data/examples/webshops/amazon/002-8212888-3924065_files/endcap-a9-go-2.gif +0 -0
  102. data/examples/webshops/amazon/002-8212888-3924065_files/gb-open-new.gif +0 -0
  103. data/examples/webshops/amazon/002-8212888-3924065_files/gc-logo-popover-a.gif +0 -0
  104. data/examples/webshops/amazon/002-8212888-3924065_files/gift-cert-roto-pop-a.gif +0 -0
  105. data/examples/webshops/amazon/002-8212888-3924065_files/go-button-books.gif +0 -0
  106. data/examples/webshops/amazon/002-8212888-3924065_files/go-button.gif +0 -0
  107. data/examples/webshops/amazon/002-8212888-3924065_files/go-orange-trans.gif +0 -0
  108. data/examples/webshops/amazon/002-8212888-3924065_files/go_button_photo.gif +0 -0
  109. data/examples/webshops/amazon/002-8212888-3924065_files/logo-off.gif +0 -0
  110. data/examples/webshops/amazon/002-8212888-3924065_files/n2CoreLibs-events-18134.js +1407 -0
  111. data/examples/webshops/amazon/002-8212888-3924065_files/n2CoreLibs-n2v1-57871.css +364 -0
  112. data/examples/webshops/amazon/002-8212888-3924065_files/n2CoreLibs-simplePopover-41153.js +749 -0
  113. data/examples/webshops/amazon/002-8212888-3924065_files/n2CoreLibs-utilities-25439.js +1608 -0
  114. data/examples/webshops/amazon/002-8212888-3924065_files/orange-arrow.gif +0 -0
  115. data/examples/webshops/amazon/002-8212888-3924065_files/orange-arrow_002.gif +0 -0
  116. data/examples/webshops/amazon/002-8212888-3924065_files/popover-blurb.gif +0 -0
  117. data/examples/webshops/amazon/002-8212888-3924065_files/powered-by-a9.gif +0 -0
  118. data/examples/webshops/amazon/002-8212888-3924065_files/stars-3-5.gif +0 -0
  119. data/examples/webshops/amazon/002-8212888-3924065_files/stars-4-0.gif +0 -0
  120. data/examples/webshops/amazon/002-8212888-3924065_files/stars-4-5.gif +0 -0
  121. data/examples/webshops/amazon/002-8212888-3924065_files/stars-5-0.gif +0 -0
  122. data/examples/webshops/amazon/002-8212888-3924065_files/tagline-adwidget.gif +0 -0
  123. data/examples/webshops/amazon/002-8212888-3924065_files/topnav-cart.gif +0 -0
  124. data/examples/webshops/amazon/002-8212888-3924065_files/transparent-pixel.gif +0 -0
  125. data/examples/webshops/amazon/002-8212888-3924065_files/transparent-pixel_002.gif +0 -0
  126. data/examples/webshops/amazon/test.rb +27 -0
  127. data/examples/webshops/amazon-online/test.rb +34 -0
  128. data/examples/webshops/barnes_and_noble/test.rb +32 -0
  129. data/examples/webshops/barnes_offline/barnes_and_noble.html +115 -0
  130. data/examples/webshops/barnes_offline/barnes_and_noble_files/10964080.gif +0 -0
  131. data/examples/webshops/barnes_offline/barnes_and_noble_files/10999676.gif +0 -0
  132. data/examples/webshops/barnes_offline/barnes_and_noble_files/11018492.gif +0 -0
  133. data/examples/webshops/barnes_offline/barnes_and_noble_files/11656534.gif +0 -0
  134. data/examples/webshops/barnes_offline/barnes_and_noble_files/11985045.gif +0 -0
  135. data/examples/webshops/barnes_offline/barnes_and_noble_files/12052378.gif +0 -0
  136. data/examples/webshops/barnes_offline/barnes_and_noble_files/12138286.gif +0 -0
  137. data/examples/webshops/barnes_offline/barnes_and_noble_files/12533212.gif +0 -0
  138. data/examples/webshops/barnes_offline/barnes_and_noble_files/12533268.gif +0 -0
  139. data/examples/webshops/barnes_offline/barnes_and_noble_files/9583392.gif +0 -0
  140. data/examples/webshops/barnes_offline/barnes_and_noble_files/SearchProduct.css +626 -0
  141. data/examples/webshops/barnes_offline/barnes_and_noble_files/admin3_gtpointup.gif +0 -0
  142. data/examples/webshops/barnes_offline/barnes_and_noble_files/admin_aboutshipping.gif +0 -0
  143. data/examples/webshops/barnes_offline/barnes_and_noble_files/admin_account.gif +0 -0
  144. data/examples/webshops/barnes_offline/barnes_and_noble_files/admin_colon.gif +0 -0
  145. data/examples/webshops/barnes_offline/barnes_and_noble_files/admin_giftreminder.gif +0 -0
  146. data/examples/webshops/barnes_offline/barnes_and_noble_files/admin_help.gif +0 -0
  147. data/examples/webshops/barnes_offline/barnes_and_noble_files/admin_orderstatus.gif +0 -0
  148. data/examples/webshops/barnes_offline/barnes_and_noble_files/admin_wishlist.gif +0 -0
  149. data/examples/webshops/barnes_offline/barnes_and_noble_files/bg.gif +0 -0
  150. data/examples/webshops/barnes_offline/barnes_and_noble_files/btnGoGrn.gif +0 -0
  151. data/examples/webshops/barnes_offline/barnes_and_noble_files/cleardot.gif +0 -0
  152. data/examples/webshops/barnes_offline/barnes_and_noble_files/cleardot_002.gif +0 -0
  153. data/examples/webshops/barnes_offline/barnes_and_noble_files/dot4.gif +0 -0
  154. data/examples/webshops/barnes_offline/barnes_and_noble_files/dotGold20.gif +0 -0
  155. data/examples/webshops/barnes_offline/barnes_and_noble_files/hdCantFind.gif +0 -0
  156. data/examples/webshops/barnes_offline/barnes_and_noble_files/hdSearchResults.gif +0 -0
  157. data/examples/webshops/barnes_offline/barnes_and_noble_files/hgg_tab_home_cold.gif +0 -0
  158. data/examples/webshops/barnes_offline/barnes_and_noble_files/hgg_tab_toy_cold.gif +0 -0
  159. data/examples/webshops/barnes_offline/barnes_and_noble_files/iframeKMP.js +172 -0
  160. data/examples/webshops/barnes_offline/barnes_and_noble_files/kmp_iframe_cds2.html +25 -0
  161. data/examples/webshops/barnes_offline/barnes_and_noble_files/kmp_iframe_cds2_data/070226_mc_lnav_search.gif +0 -0
  162. data/examples/webshops/barnes_offline/barnes_and_noble_files/kmp_iframe_cds2_data/XmlUtil.js +199 -0
  163. data/examples/webshops/barnes_offline/barnes_and_noble_files/kmp_iframe_cds2_data/XslStyleSheet.js +1 -0
  164. data/examples/webshops/barnes_offline/barnes_and_noble_files/kmp_iframe_cds2_data/kmp_gen.css +81 -0
  165. data/examples/webshops/barnes_offline/barnes_and_noble_files/kmp_iframe_cds2_data/product-preview-core.js +337 -0
  166. data/examples/webshops/barnes_offline/barnes_and_noble_files/kmp_iframe_cds2_data/product-preview.css +36 -0
  167. data/examples/webshops/barnes_offline/barnes_and_noble_files/kmp_iframe_cds2_data/productpreview.js +11 -0
  168. data/examples/webshops/barnes_offline/barnes_and_noble_files/linePagination.gif +0 -0
  169. data/examples/webshops/barnes_offline/barnes_and_noble_files/logo_bn05.gif +0 -0
  170. data/examples/webshops/barnes_offline/barnes_and_noble_files/navbar.js +34 -0
  171. data/examples/webshops/barnes_offline/barnes_and_noble_files/navbar_06.css +136 -0
  172. data/examples/webshops/barnes_offline/barnes_and_noble_files/popup_open.js +116 -0
  173. data/examples/webshops/barnes_offline/barnes_and_noble_files/qsearch3_vline_dots.gif +0 -0
  174. data/examples/webshops/barnes_offline/barnes_and_noble_files/qsearch4_search.gif +0 -0
  175. data/examples/webshops/barnes_offline/barnes_and_noble_files/qsearch_AdvSearch.jpg +0 -0
  176. data/examples/webshops/barnes_offline/barnes_and_noble_files/subnav_colon.gif +0 -0
  177. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_Bookclubs_cold.gif +0 -0
  178. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_bnjr_cold.gif +0 -0
  179. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_books_hot.gif +0 -0
  180. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_dvd_cold.gif +0 -0
  181. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_giftcards_cold.gif +0 -0
  182. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_home_cold.gif +0 -0
  183. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_member_cc_cold.gif +0 -0
  184. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_music_cold.gif +0 -0
  185. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_pipe.gif +0 -0
  186. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_textbooksonly_cold.gif +0 -0
  187. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_usedoop_cold.gif +0 -0
  188. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_videogames_cold.gif +0 -0
  189. data/examples/webshops/barnes_offline/barnes_and_noble_files/toppromo3_rule.gif +0 -0
  190. data/examples/webshops/barnes_offline/barnes_and_noble_files/toppromo_fastfree05.gif +0 -0
  191. data/examples/webshops/barnes_offline/barnes_and_noble_files/vcart_btn_checkout.gif +0 -0
  192. data/examples/webshops/barnes_offline/barnes_and_noble_files/vcart_icon_cart.gif +0 -0
  193. data/examples/webshops/barnes_offline/barnes_and_noble_files/vcart_topbot_rule.gif +0 -0
  194. data/examples/webshops/barnes_offline/barnes_and_noble_files/visualcart_prodid.js +401 -0
  195. data/examples/webshops/barnes_offline/test.rb +30 -0
  196. data/examples/webshops/buydig/2_files/03AA1BB9089A4A6A92CF23F280D664EB.jpg +0 -0
  197. data/examples/webshops/buydig/2_files/1008.gif +0 -0
  198. data/examples/webshops/buydig/2_files/1013.gif +0 -0
  199. data/examples/webshops/buydig/2_files/1020.gif +0 -0
  200. data/examples/webshops/buydig/2_files/106CF2FB84B446518397517C3E6D5AD8.jpg +0 -0
  201. data/examples/webshops/buydig/2_files/13-www.gif +0 -0
  202. data/examples/webshops/buydig/2_files/1E9BB2E56AB145FC9D6EF952703AF476.jpg +0 -0
  203. data/examples/webshops/buydig/2_files/1FCDFBE85CDB4D429EC2C8CB24D20457.jpg +0 -0
  204. data/examples/webshops/buydig/2_files/1pix.gif +0 -0
  205. data/examples/webshops/buydig/2_files/2014.gif +0 -0
  206. data/examples/webshops/buydig/2_files/2089.gif +0 -0
  207. data/examples/webshops/buydig/2_files/24992_medal.gif +0 -0
  208. data/examples/webshops/buydig/2_files/24BBCBA1397F4DDCBBBBE8456D6D6E5B.jpg +0 -0
  209. data/examples/webshops/buydig/2_files/281F8A6019B140F38DFD45EB7B69B0FB.jpg +0 -0
  210. data/examples/webshops/buydig/2_files/2975F866CB2149F7ACBC559C8E24E304.jpg +0 -0
  211. data/examples/webshops/buydig/2_files/316FC9256DC9460ABC3C5ECAF6C60286.jpg +0 -0
  212. data/examples/webshops/buydig/2_files/50569327D8B94252B95E449AE470E505.jpg +0 -0
  213. data/examples/webshops/buydig/2_files/519CDAB404FA4543B76B5F281468ACBF.jpg +0 -0
  214. data/examples/webshops/buydig/2_files/57D6146419A647BA89C96AF0B5CAB03C.jpg +0 -0
  215. data/examples/webshops/buydig/2_files/58E3F988E184448B8C0A59874AE123A8.jpg +0 -0
  216. data/examples/webshops/buydig/2_files/5E5B10197A4E4C9B9ECCD6309DBE4C54.jpg +0 -0
  217. data/examples/webshops/buydig/2_files/609A249177D04065B37B9161CB0BC92D.jpg +0 -0
  218. data/examples/webshops/buydig/2_files/676CEE8E53C2445982E991871B4DF613.jpg +0 -0
  219. data/examples/webshops/buydig/2_files/712BA08FAB524A31A76ABB9E2009FF8E.jpg +0 -0
  220. data/examples/webshops/buydig/2_files/734BD08D7A5049339393166491D09D21.jpg +0 -0
  221. data/examples/webshops/buydig/2_files/751E72B7003343248497FE6905F80787.jpg +0 -0
  222. data/examples/webshops/buydig/2_files/76493D4F02F14EF7B5886510604C7BB4.jpg +0 -0
  223. data/examples/webshops/buydig/2_files/79521E251278486DB29529C60C9D012A.jpg +0 -0
  224. data/examples/webshops/buydig/2_files/9C9AF82AC3B54BDC8C705278B50FDFD6.jpg +0 -0
  225. data/examples/webshops/buydig/2_files/BC3FD8307B9948FDB7EEF156D8629C37.jpg +0 -0
  226. data/examples/webshops/buydig/2_files/C0DD4574765047D1836F505E69DC8AE5.jpg +0 -0
  227. data/examples/webshops/buydig/2_files/C143F48515274A44B04F4B3E46306BD2.jpg +0 -0
  228. data/examples/webshops/buydig/2_files/C6B02E88F729464699DB275D140F4563.jpg +0 -0
  229. data/examples/webshops/buydig/2_files/CE334D6206DB4FA9AFDF339AEF0AF50F.jpg +0 -0
  230. data/examples/webshops/buydig/2_files/D66AE0DC865A4021AB300ED3A0C4CD11.jpg +0 -0
  231. data/examples/webshops/buydig/2_files/DEA2EC2093DC474D96B651068576DAE5.jpg +0 -0
  232. data/examples/webshops/buydig/2_files/F547677D83844042BF13A4BE6523BB50.jpg +0 -0
  233. data/examples/webshops/buydig/2_files/Rbbbonlineseal.gif +0 -0
  234. data/examples/webshops/buydig/2_files/TopSellers_bottom.gif +0 -0
  235. data/examples/webshops/buydig/2_files/TopSellers_ttl.gif +0 -0
  236. data/examples/webshops/buydig/2_files/addToFavorites_ttl.gif +0 -0
  237. data/examples/webshops/buydig/2_files/banner_CorporateSales.gif +0 -0
  238. data/examples/webshops/buydig/2_files/banner_Shipping.gif +0 -0
  239. data/examples/webshops/buydig/2_files/bizratehonoree.gif +0 -0
  240. data/examples/webshops/buydig/2_files/btn_submit.gif +0 -0
  241. data/examples/webshops/buydig/2_files/checkFlash.js +110 -0
  242. data/examples/webshops/buydig/2_files/checkFlash2.js +109 -0
  243. data/examples/webshops/buydig/2_files/cnetcertified.gif +0 -0
  244. data/examples/webshops/buydig/2_files/credPriceGrabber.gif +0 -0
  245. data/examples/webshops/buydig/2_files/credShopping.gif +0 -0
  246. data/examples/webshops/buydig/2_files/credential_paypal.gif +0 -0
  247. data/examples/webshops/buydig/2_files/credentials.gif +0 -0
  248. data/examples/webshops/buydig/2_files/dealtime.gif +0 -0
  249. data/examples/webshops/buydig/2_files/dvxstyle.css +754 -0
  250. data/examples/webshops/buydig/2_files/footer_021306_1_v1.gif +0 -0
  251. data/examples/webshops/buydig/2_files/func.js +132 -0
  252. data/examples/webshops/buydig/2_files/getseal +1 -0
  253. data/examples/webshops/buydig/2_files/help.gif +0 -0
  254. data/examples/webshops/buydig/2_files/home.gif +0 -0
  255. data/examples/webshops/buydig/2_files/java.js +155 -0
  256. data/examples/webshops/buydig/2_files/leftnv_help.gif +0 -0
  257. data/examples/webshops/buydig/2_files/logo.gif +0 -0
  258. data/examples/webshops/buydig/2_files/logo2.gif +0 -0
  259. data/examples/webshops/buydig/2_files/logo3.gif +0 -0
  260. data/examples/webshops/buydig/2_files/main.js +227 -0
  261. data/examples/webshops/buydig/2_files/mastercard_secured.gif +0 -0
  262. data/examples/webshops/buydig/2_files/newsBox_bkg.jpg +0 -0
  263. data/examples/webshops/buydig/2_files/newsBox_bottom.jpg +0 -0
  264. data/examples/webshops/buydig/2_files/newsBox_text.gif +0 -0
  265. data/examples/webshops/buydig/2_files/newsBox_ttl.jpg +0 -0
  266. data/examples/webshops/buydig/2_files/noimage75.gif +0 -0
  267. data/examples/webshops/buydig/2_files/orangeleftfrmbtm.gif +0 -0
  268. data/examples/webshops/buydig/2_files/pixel153.gif +0 -0
  269. data/examples/webshops/buydig/2_files/rightnv_bottom.gif +0 -0
  270. data/examples/webshops/buydig/2_files/search_btn_off.gif +0 -0
  271. data/examples/webshops/buydig/2_files/search_c1.gif +0 -0
  272. data/examples/webshops/buydig/2_files/search_c2.gif +0 -0
  273. data/examples/webshops/buydig/2_files/search_c3.gif +0 -0
  274. data/examples/webshops/buydig/2_files/search_c4.gif +0 -0
  275. data/examples/webshops/buydig/2_files/search_down.gif +0 -0
  276. data/examples/webshops/buydig/2_files/search_left.gif +0 -0
  277. data/examples/webshops/buydig/2_files/search_right.gif +0 -0
  278. data/examples/webshops/buydig/2_files/search_top.gif +0 -0
  279. data/examples/webshops/buydig/2_files/siteLinks_bottom.gif +0 -0
  280. data/examples/webshops/buydig/2_files/siteLinks_bullet.gif +0 -0
  281. data/examples/webshops/buydig/2_files/siteLinks_ttl.gif +0 -0
  282. data/examples/webshops/buydig/2_files/spacer.gif +0 -0
  283. data/examples/webshops/buydig/2_files/style.js +45 -0
  284. data/examples/webshops/buydig/2_files/styles.html +33 -0
  285. data/examples/webshops/buydig/2_files/track_orders.jpg +0 -0
  286. data/examples/webshops/buydig/2_files/urchin +534 -0
  287. data/examples/webshops/buydig/2_files/verified_by_visa.gif +0 -0
  288. data/examples/webshops/buydig/2_files/welcome.gif +0 -0
  289. data/examples/webshops/buydig/2_files/welcome_ttl.gif +0 -0
  290. data/examples/webshops/buydig/2_files/yahoologo.gif +0 -0
  291. data/examples/webshops/buydig/input.html +1194 -0
  292. data/examples/webshops/buydig/test.rb +31 -0
  293. data/examples/webshops/ebay/test.rb +32 -0
  294. data/examples/webshops/finewines_offline/_finewines.html +1739 -0
  295. data/examples/webshops/finewines_offline/_finewines_cut.html +371 -0
  296. data/examples/webshops/finewines_offline/_finewines_files/011064.jpg +0 -0
  297. data/examples/webshops/finewines_offline/_finewines_files/012674.jpg +0 -0
  298. data/examples/webshops/finewines_offline/_finewines_files/013268.jpg +0 -0
  299. data/examples/webshops/finewines_offline/_finewines_files/013300.jpg +0 -0
  300. data/examples/webshops/finewines_offline/_finewines_files/013409.jpg +0 -0
  301. data/examples/webshops/finewines_offline/_finewines_files/014340.jpg +0 -0
  302. data/examples/webshops/finewines_offline/_finewines_files/015073.jpg +0 -0
  303. data/examples/webshops/finewines_offline/_finewines_files/015255.jpg +0 -0
  304. data/examples/webshops/finewines_offline/_finewines_files/015479.jpg +0 -0
  305. data/examples/webshops/finewines_offline/_finewines_files/015487.jpg +0 -0
  306. data/examples/webshops/finewines_offline/_finewines_files/017038.jpg +0 -0
  307. data/examples/webshops/finewines_offline/_finewines_files/017129.jpg +0 -0
  308. data/examples/webshops/finewines_offline/_finewines_files/017145.jpg +0 -0
  309. data/examples/webshops/finewines_offline/_finewines_files/017152.jpg +0 -0
  310. data/examples/webshops/finewines_offline/_finewines_files/017285.jpg +0 -0
  311. data/examples/webshops/finewines_offline/_finewines_files/017392.jpg +0 -0
  312. data/examples/webshops/finewines_offline/_finewines_files/017400.jpg +0 -0
  313. data/examples/webshops/finewines_offline/_finewines_files/019778.jpg +0 -0
  314. data/examples/webshops/finewines_offline/_finewines_files/019786.jpg +0 -0
  315. data/examples/webshops/finewines_offline/_finewines_files/020503.jpg +0 -0
  316. data/examples/webshops/finewines_offline/_finewines_files/021253.jpg +0 -0
  317. data/examples/webshops/finewines_offline/_finewines_files/021279.jpg +0 -0
  318. data/examples/webshops/finewines_offline/_finewines_files/021337.jpg +0 -0
  319. data/examples/webshops/finewines_offline/_finewines_files/021352.jpg +0 -0
  320. data/examples/webshops/finewines_offline/_finewines_files/023002.jpg +0 -0
  321. data/examples/webshops/finewines_offline/_finewines_files/023135.jpg +0 -0
  322. data/examples/webshops/finewines_offline/_finewines_files/023143.jpg +0 -0
  323. data/examples/webshops/finewines_offline/_finewines_files/023788.jpg +0 -0
  324. data/examples/webshops/finewines_offline/_finewines_files/024166.jpg +0 -0
  325. data/examples/webshops/finewines_offline/_finewines_files/024182.jpg +0 -0
  326. data/examples/webshops/finewines_offline/_finewines_files/024216.jpg +0 -0
  327. data/examples/webshops/finewines_offline/_finewines_files/027268.jpg +0 -0
  328. data/examples/webshops/finewines_offline/_finewines_files/027516.jpg +0 -0
  329. data/examples/webshops/finewines_offline/_finewines_files/027862.jpg +0 -0
  330. data/examples/webshops/finewines_offline/_finewines_files/028118.jpg +0 -0
  331. data/examples/webshops/finewines_offline/_finewines_files/028936.jpg +0 -0
  332. data/examples/webshops/finewines_offline/_finewines_files/033894.jpg +0 -0
  333. data/examples/webshops/finewines_offline/_finewines_files/033902.jpg +0 -0
  334. data/examples/webshops/finewines_offline/_finewines_files/033910.jpg +0 -0
  335. data/examples/webshops/finewines_offline/_finewines_files/033936.jpg +0 -0
  336. data/examples/webshops/finewines_offline/_finewines_files/033944.jpg +0 -0
  337. data/examples/webshops/finewines_offline/_finewines_files/033951.jpg +0 -0
  338. data/examples/webshops/finewines_offline/_finewines_files/034553.jpg +0 -0
  339. data/examples/webshops/finewines_offline/_finewines_files/034561.jpg +0 -0
  340. data/examples/webshops/finewines_offline/_finewines_files/232439.jpg +0 -0
  341. data/examples/webshops/finewines_offline/_finewines_files/237834.jpg +0 -0
  342. data/examples/webshops/finewines_offline/_finewines_files/268359.jpg +0 -0
  343. data/examples/webshops/finewines_offline/_finewines_files/289082.jpg +0 -0
  344. data/examples/webshops/finewines_offline/_finewines_files/331603.jpg +0 -0
  345. data/examples/webshops/finewines_offline/_finewines_files/369686.jpg +0 -0
  346. data/examples/webshops/finewines_offline/_finewines_files/420257.jpg +0 -0
  347. data/examples/webshops/finewines_offline/_finewines_files/422014.jpg +0 -0
  348. data/examples/webshops/finewines_offline/_finewines_files/460410.jpg +0 -0
  349. data/examples/webshops/finewines_offline/_finewines_files/480533.jpg +0 -0
  350. data/examples/webshops/finewines_offline/_finewines_files/556795.jpg +0 -0
  351. data/examples/webshops/finewines_offline/_finewines_files/597054.jpg +0 -0
  352. data/examples/webshops/finewines_offline/_finewines_files/650606.jpg +0 -0
  353. data/examples/webshops/finewines_offline/_finewines_files/652628.jpg +0 -0
  354. data/examples/webshops/finewines_offline/_finewines_files/653790.jpg +0 -0
  355. data/examples/webshops/finewines_offline/_finewines_files/658450.jpg +0 -0
  356. data/examples/webshops/finewines_offline/_finewines_files/660027.jpg +0 -0
  357. data/examples/webshops/finewines_offline/_finewines_files/660951.jpg +0 -0
  358. data/examples/webshops/finewines_offline/_finewines_files/684514.jpg +0 -0
  359. data/examples/webshops/finewines_offline/_finewines_files/685131.jpg +0 -0
  360. data/examples/webshops/finewines_offline/_finewines_files/686865.jpg +0 -0
  361. data/examples/webshops/finewines_offline/_finewines_files/699330.jpg +0 -0
  362. data/examples/webshops/finewines_offline/_finewines_files/703017.jpg +0 -0
  363. data/examples/webshops/finewines_offline/_finewines_files/703140.jpg +0 -0
  364. data/examples/webshops/finewines_offline/_finewines_files/703850.jpg +0 -0
  365. data/examples/webshops/finewines_offline/_finewines_files/717306.jpg +0 -0
  366. data/examples/webshops/finewines_offline/_finewines_files/900274.jpg +0 -0
  367. data/examples/webshops/finewines_offline/_finewines_files/938225.jpg +0 -0
  368. data/examples/webshops/finewines_offline/_finewines_files/947440.jpg +0 -0
  369. data/examples/webshops/finewines_offline/_finewines_files/951319.jpg +0 -0
  370. data/examples/webshops/finewines_offline/_finewines_files/967893.jpg +0 -0
  371. data/examples/webshops/finewines_offline/_finewines_files/981407.jpg +0 -0
  372. data/examples/webshops/finewines_offline/_finewines_files/981613.jpg +0 -0
  373. data/examples/webshops/finewines_offline/_finewines_files/982421.jpg +0 -0
  374. data/examples/webshops/finewines_offline/_finewines_files/985598.jpg +0 -0
  375. data/examples/webshops/finewines_offline/_finewines_files/986737.jpg +0 -0
  376. data/examples/webshops/finewines_offline/_finewines_files/987503.jpg +0 -0
  377. data/examples/webshops/finewines_offline/_finewines_files/992800.jpg +0 -0
  378. data/examples/webshops/finewines_offline/_finewines_files/VintageslogoEN.gif +0 -0
  379. data/examples/webshops/finewines_offline/_finewines_files/blanc-up.gif +0 -0
  380. data/examples/webshops/finewines_offline/_finewines_files/btn_vintages_latest.gif +0 -0
  381. data/examples/webshops/finewines_offline/_finewines_files/cc_en.gif +0 -0
  382. data/examples/webshops/finewines_offline/_finewines_files/common.js +860 -0
  383. data/examples/webshops/finewines_offline/_finewines_files/drink.gif +0 -0
  384. data/examples/webshops/finewines_offline/_finewines_files/drinkhold.gif +0 -0
  385. data/examples/webshops/finewines_offline/_finewines_files/ec_en.gif +0 -0
  386. data/examples/webshops/finewines_offline/_finewines_files/ev_en.gif +0 -0
  387. data/examples/webshops/finewines_offline/_finewines_files/hold.gif +0 -0
  388. data/examples/webshops/finewines_offline/_finewines_files/index-wines-features.jpg +0 -0
  389. data/examples/webshops/finewines_offline/_finewines_files/indexSTYLE.css +398 -0
  390. data/examples/webshops/finewines_offline/_finewines_files/keyword_search.gif +0 -0
  391. data/examples/webshops/finewines_offline/_finewines_files/mm_menu.js +1 -0
  392. data/examples/webshops/finewines_offline/_finewines_files/nr_en.gif +0 -0
  393. data/examples/webshops/finewines_offline/_finewines_files/ontario_en.gif +0 -0
  394. data/examples/webshops/finewines_offline/_finewines_files/popup.js +81 -0
  395. data/examples/webshops/finewines_offline/_finewines_files/releases_nav.js +229 -0
  396. data/examples/webshops/finewines_offline/_finewines_files/so_en.gif +0 -0
  397. data/examples/webshops/finewines_offline/_finewines_files/spacer.gif +0 -0
  398. data/examples/webshops/finewines_offline/_finewines_files/top.gif +0 -0
  399. data/examples/webshops/finewines_offline/_finewines_files/urchin.js +576 -0
  400. data/examples/webshops/finewines_offline/_finewines_files/wom_en.gif +0 -0
  401. data/examples/webshops/finewines_offline/test.rb +30 -0
  402. data/examples/webshops/us1camera/1_files/1pix.gif +0 -0
  403. data/examples/webshops/us1camera/1_files/1pix_002.gif +0 -0
  404. data/examples/webshops/us1camera/1_files/CnetCertified.gif +0 -0
  405. data/examples/webshops/us1camera/1_files/CyberSource.gif +0 -0
  406. data/examples/webshops/us1camera/1_files/Images50.gif +0 -0
  407. data/examples/webshops/us1camera/1_files/Images50_002.gif +0 -0
  408. data/examples/webshops/us1camera/1_files/Images50_003.gif +0 -0
  409. data/examples/webshops/us1camera/1_files/Images50_004.gif +0 -0
  410. data/examples/webshops/us1camera/1_files/Images50_005.gif +0 -0
  411. data/examples/webshops/us1camera/1_files/Images50_006.gif +0 -0
  412. data/examples/webshops/us1camera/1_files/PriceGrabber.gif +0 -0
  413. data/examples/webshops/us1camera/1_files/QSearch.gif +0 -0
  414. data/examples/webshops/us1camera/1_files/ban-m.jpg +0 -0
  415. data/examples/webshops/us1camera/1_files/banner1.bin +0 -0
  416. data/examples/webshops/us1camera/1_files/banner3.bin +0 -0
  417. data/examples/webshops/us1camera/1_files/block1.jpg +0 -0
  418. data/examples/webshops/us1camera/1_files/block2.jpg +0 -0
  419. data/examples/webshops/us1camera/1_files/block3.jpg +0 -0
  420. data/examples/webshops/us1camera/1_files/block4.jpg +0 -0
  421. data/examples/webshops/us1camera/1_files/block5.jpg +0 -0
  422. data/examples/webshops/us1camera/1_files/block6.jpg +0 -0
  423. data/examples/webshops/us1camera/1_files/bos.js +280 -0
  424. data/examples/webshops/us1camera/1_files/box1.jpg +0 -0
  425. data/examples/webshops/us1camera/1_files/box2.jpg +0 -0
  426. data/examples/webshops/us1camera/1_files/box3.jpg +0 -0
  427. data/examples/webshops/us1camera/1_files/box4.jpg +0 -0
  428. data/examples/webshops/us1camera/1_files/dot.jpg +0 -0
  429. data/examples/webshops/us1camera/1_files/eDevix.gif +0 -0
  430. data/examples/webshops/us1camera/1_files/electronics1.jpg +0 -0
  431. data/examples/webshops/us1camera/1_files/getseal +1 -0
  432. data/examples/webshops/us1camera/1_files/pride.jpg +0 -0
  433. data/examples/webshops/us1camera/1_files/search.jpg +0 -0
  434. data/examples/webshops/us1camera/1_files/sidebutton.jpg +0 -0
  435. data/examples/webshops/us1camera/1_files/sslroilogic.js +49 -0
  436. data/examples/webshops/us1camera/1_files/style.css +1 -0
  437. data/examples/webshops/us1camera/1_files/tl.html +2 -0
  438. data/examples/webshops/us1camera/input.html +548 -0
  439. data/examples/webshops/us1camera/test.rb +37 -0
  440. data/lib/scrubyt/core/navigation/agents/firewatir.rb +285 -0
  441. data/lib/scrubyt/core/navigation/agents/mechanize.rb +315 -0
  442. data/lib/scrubyt/core/navigation/fetch_action.rb +63 -0
  443. data/lib/scrubyt/core/navigation/navigation_actions.rb +107 -0
  444. data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
  445. data/lib/scrubyt/core/scraping/constraint.rb +169 -0
  446. data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
  447. data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
  448. data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
  449. data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
  450. data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
  451. data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
  452. data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
  453. data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
  454. data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
  455. data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
  456. data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
  457. data/lib/scrubyt/core/scraping/pattern.rb +359 -0
  458. data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
  459. data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
  460. data/lib/scrubyt/core/shared/extractor.rb +171 -0
  461. data/lib/scrubyt/logging.rb +154 -0
  462. data/lib/scrubyt/output/post_processor.rb +139 -0
  463. data/lib/scrubyt/output/result.rb +44 -0
  464. data/lib/scrubyt/output/result_dumper.rb +154 -0
  465. data/lib/scrubyt/output/result_node.rb +145 -0
  466. data/lib/scrubyt/output/scrubyt_result.rb +42 -0
  467. data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
  468. data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
  469. data/lib/scrubyt/utils/shared_utils.rb +58 -0
  470. data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
  471. data/lib/scrubyt/utils/xpathutils.rb +202 -0
  472. data/lib/scrubyt.rb +53 -0
  473. data/pkg/scrubyt-0.4.31.gem +0 -0
  474. data/resources/allison/LICENSE +184 -0
  475. data/resources/allison/README +37 -0
  476. data/resources/allison/allison.css +301 -0
  477. data/resources/allison/allison.gif +0 -0
  478. data/resources/allison/allison.js +307 -0
  479. data/resources/allison/allison.rb +287 -0
  480. data/resources/allison/cache/BODY +588 -0
  481. data/resources/allison/cache/CLASS_INDEX +4 -0
  482. data/resources/allison/cache/CLASS_PAGE +1 -0
  483. data/resources/allison/cache/FILE_INDEX +4 -0
  484. data/resources/allison/cache/FILE_PAGE +1 -0
  485. data/resources/allison/cache/FONTS +1 -0
  486. data/resources/allison/cache/FR_INDEX_BODY +1 -0
  487. data/resources/allison/cache/IMGPATH +1 -0
  488. data/resources/allison/cache/INDEX +1 -0
  489. data/resources/allison/cache/JAVASCRIPT +307 -0
  490. data/resources/allison/cache/METHOD_INDEX +4 -0
  491. data/resources/allison/cache/METHOD_LIST +1 -0
  492. data/resources/allison/cache/SRC_PAGE +1 -0
  493. data/resources/allison/cache/STYLE +323 -0
  494. data/resources/allison/cache/URL +1 -0
  495. data/scrubyt.gemspec +609 -0
  496. data/test/blackbox_test.rb +60 -0
  497. data/test/blackbox_tests/basic/multi_root.expected.xml +8 -0
  498. data/test/blackbox_tests/basic/multi_root.rb +6 -0
  499. data/test/blackbox_tests/basic/simple.expected.xml +5 -0
  500. data/test/blackbox_tests/basic/simple.rb +5 -0
  501. data/test/blackbox_tests/basic/three_divs.html +12 -0
  502. data/test/blackbox_tests/detail_page/detail_page_1.html +7 -0
  503. data/test/blackbox_tests/detail_page/detail_page_2.html +7 -0
  504. data/test/blackbox_tests/detail_page/main_page_1.html +5 -0
  505. data/test/blackbox_tests/detail_page/main_page_2.html +6 -0
  506. data/test/blackbox_tests/detail_page/one_detail_page.expected.xml +7 -0
  507. data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
  508. data/test/blackbox_tests/detail_page/two_detail_pages.expected.xml +12 -0
  509. data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
  510. data/test/blackbox_tests/next_page/next_page_link.expected.xml +11 -0
  511. data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
  512. data/test/blackbox_tests/next_page/page_1.html +11 -0
  513. data/test/blackbox_tests/next_page/page_2.html +11 -0
  514. data/test/blackbox_tests/next_page/page_3.html +7 -0
  515. data/test/blackbox_tests/next_page/page_list_links.expected.xml +11 -0
  516. data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
  517. data/test/blackbox_tests/next_page/page_list_links.tofix +7 -0
  518. data/todo/backlog.txt +73 -0
  519. data/todo/scenario_ideas.txt +19 -0
  520. metadata +637 -0
@@ -0,0 +1,171 @@
1
module Scrubyt
  ##
  #=<tt>Driving the whole extraction process</tt>
  #
  #Extractor is a performer class - it gets an extractor definition and carries
  #out the actions and evaluates the wrappers sequentially.
  #
  #Originally also the navigation actions were here, but since the class got too
  #big, they were factored out to an own class, NavigationAction.
  class Extractor
    include FetchAction

    attr_accessor :result, :evaluating_extractor_definition, :mode, :root_patterns, :next_page_pattern#, :hpricot_doc, :current_doc_url

    #The definition of the extractor is passed through this method.
    #
    #+mode+ is either nil, a symbol such as :production, or an options Hash.
    #Only a Hash with <tt>:agent => :firefox</tt> selects the Firewatir
    #navigation agent; every other value selects Mechanize.
    #
    #Returns the result object built by the new extractor.
    def self.define(mode=nil, &extractor_definition)
      agent = if mode.is_a?(Hash) && mode[:agent] == :firefox
                Navigation::Firewatir
              else
                Navigation::Mechanize
              end
      FetchAction.class_eval { include agent }

      new(mode, extractor_definition).result
    end

    #Reads +filename+, evaluates its content and uses the resulting proc as
    #the extractor definition.
    #
    #NOTE(review): this runs arbitrary Ruby code from the file through eval -
    #only ever call it with trusted files.
    def self.load(filename)
      define(&eval(IO.read(filename)))
    end

    #Evaluates +extractor_definition+ in a fresh context object, runs the
    #extraction and stores the outcome in @result.
    #
    #The body is deliberately kept flat: the source file is taken from a fixed
    #position of the backtrace, so no extra call frames are introduced here.
    def initialize(mode, extractor_definition)
      @mode = mode
      @root_patterns = []
      @next_page_pattern = nil
      # @hpricot_doc = nil
      # @hpricot_doc_url = nil
      @evaluating_extractor_definition = false
      @next_page_list = []
      @processed_pages = []

      backtrace = SharedUtils.get_backtrace
      source_file = backtrace[1].split(':')[0]

      Scrubyt.log :MODE, mode == :production ? 'Production' : 'Learning'

      @evaluating_extractor_definition = true
      context = Object.new
      context.extend NavigationActions
      #These become singleton methods of the context object: every unknown
      #method called inside the definition block creates a root pattern.
      context.instance_eval do
        def extractor=(value)
          @extractor = value
        end

        def next_page(*args)
          @extractor.next_page_pattern = Scrubyt::Pattern.new('next_page', args, @extractor)
        end

        def method_missing(method_name, *args, &block)
          root_pattern = Scrubyt::Pattern.new(method_name.to_s, args, @extractor, nil, &block)
          @extractor.root_patterns << root_pattern
          root_pattern
        end
      end
      FetchAction.extractor = self
      context.extractor = self
      context.instance_eval(&extractor_definition)
      @evaluating_extractor_definition = false

      if @root_patterns.empty?
        # TODO: this should be an exception
        Scrubyt.log :ERROR, 'No extractor defined, exiting...'
        exit
      end

      #Once all is set up, evaluate the extractor from the root pattern!
      #The results are collected in @root_results.
      evaluate_extractor
      FetchAction.close_firefox if @mode.is_a?(Hash) && @mode[:close]

      @result = ScrubytResult.new('root')
      @result.push(*@root_results)
      @result.root_patterns = @root_patterns
      @result.source_file = source_file
      @result.source_proc = extractor_definition

      Scrubyt.log :INFO, 'Extraction finished succesfully!'
    end

    #The document currently held by FetchAction.
    def get_hpricot_doc
      FetchAction.get_hpricot_doc
    end

    #The URL of the document currently held by FetchAction.
    def get_current_doc_url
      FetchAction.get_current_doc_url
    end

    def get_detail_pattern_relations
      @detail_pattern_relations
    end

    def get_mode
      @mode
    end

    def get_original_host_name
      @original_host_name
    end

    #Remembers the link carried by +result_node+ as a page to visit later.
    #
    #For an Hpricot element the href of the nearest node having one is used
    #(with '&amp;' turned back into '&'); a String is used as is. Anything
    #else - or an element without a href - is ignored, so nil never ends up
    #in the list.
    def add_to_next_page_list(result_node)
      target = result_node.result
      href = nil
      if target.is_a? Hpricot::Elem
        node = XPathUtils.find_nearest_node_with_attribute(target, 'href')
        return if node == nil || node.attributes['href'] == nil
        href = node.attributes['href'].gsub('&amp;') {'&'}
      elsif target.is_a? String
        href = target
      end
      return if href.nil?
      url = href #TODO need absolute address here 1/4
      @next_page_list << url
    end

    #Evaluates every root pattern on the current document, then keeps moving
    #to the next page (via the next_page pattern or the collected next page
    #list) until there is nowhere left to go or the page limit is reached.
    #
    #Returns the accumulated root results.
    def evaluate_extractor
      @root_results ||= []
      current_page_count = 1
      catch :quit_next_page_loop do
        loop do
          url = get_current_doc_url #TODO need absolute address here 2/4
          @processed_pages << url
          @root_patterns.each do |root_pattern|
            @root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
          end

          #Look for an URL which was not processed yet.
          #NOTE(review): with a next_page pattern nothing visible here changes
          #between two iterations, so a link pointing to an already processed
          #page looks like it could spin forever - confirm.
          while @processed_pages.include? url #TODO need absolute address here 3/4
            if !@next_page_pattern.nil?
              throw :quit_next_page_loop if @next_page_pattern.options[:limit] == current_page_count
              throw :quit_next_page_loop unless @next_page_pattern.filters[0].generate_XPath_for_example(true)
              xpath = @next_page_pattern.filters[0].xpath
              #to_a instead of a blockless map: the latter returns an
              #Enumerator (which has no #last) on Ruby >= 1.8.7
              node = (get_hpricot_doc/xpath).to_a.last
              node = XPathUtils.find_nearest_node_with_attribute(node, 'href')
              throw :quit_next_page_loop if node == nil || node.attributes['href'] == nil
              href = node.attributes['href'].gsub('&amp;') {'&'}
              url = href #TODO need absolute address here 4/4
            else
              throw :quit_next_page_loop if @next_page_list.empty?
              url = @next_page_list.pop
            end
          end

          restore_host_name
          FetchAction.fetch(url)

          current_page_count += 1
        end
      end
      @root_patterns = []
      @root_results
    end

  end
end
@@ -0,0 +1,154 @@
1
+ #
2
+ # TODO: if multiline messages aren't needed, then remove them.
3
+ #
4
+ # TODO: switch to the conventional Ruby logger interface,
5
+ # or create an adapter to it. If the former, then decide what to
6
+ # do with the unit tests.
7
+ #
8
+
9
module Scrubyt
  # Logging is disabled by default. It can be enabled as follows:
  #
  #   Scrubyt.logger = Scrubyt::Logger.new # logs *all* messages to STDERR
  #
  # Assigning nil disables logging again.
  def self.logger=(logger)
    @logger = logger
  end

  # Simple logger implementation, based on Scrubyt's original logging style.
  # Messages will be sent to STDERR. Logging can be limited to certain message
  # levels by specifying them on initialization, e.g.
  #
  #   Scrubyt::Logger.new(:ACTION, :ERROR) # will only log action/error messages
  #
  class Logger
    # A single-line message, rendered as "[LEVEL] text".
    class Message
      def initialize(level, text)
        @level, @text = level.to_s, text.to_s
      end

      def to_s
        prefix + @text
      end

      protected

      # The "[LEVEL] " part put in front of the first line.
      def prefix
        @prefix ||= "[#{@level}] "
      end
    end

    # A message made of several lines: the first one gets the level prefix,
    # the following ones are indented by the width of that prefix.
    class MultiLineMessage < Message
      # +lines+ is left untouched - the caller's array is never modified.
      def initialize(level, lines)
        first_line, *continuation = lines
        super(level, first_line)

        @lines = continuation
      end

      # All the lines joined by newlines; no trailing newline is produced
      # when there are no continuation lines.
      def to_s
        head = super
        ([head] + indented_lines).join("\n")
      end

      private

      def indented_lines
        @lines.map { |line| indented(line) }
      end

      def indented(line)
        ' ' * prefix.length + line.to_s
      end
    end

    # +levels+ lists the levels to be logged; an empty list means "log all".
    def initialize(*levels)
      @levels = levels
    end

    # Writes +message+ (a String, or an Array of lines) to the output stream,
    # provided +level+ is one of the levels this logger was set up with.
    def log(level, message)
      return unless logging?(level)

      message_class = message.is_a?(Array) ? MultiLineMessage : Message

      output_stream.puts message_class.new(level, message)
    end

    # The stream the messages are written to - STDERR unless one was assigned.
    def output_stream
      @output_stream || STDERR
    end

    attr_writer :output_stream

    private

    def logging?(level)
      @levels.empty? || @levels.include?(level)
    end
  end

  # Hands +message+ over to the configured logger; does nothing when no
  # logger is set.
  def self.log(level, message)
    return if logger.nil?

    logger.log(level, message)
  end

  # The currently configured logger (nil when logging is disabled).
  # A bare `private` does not apply to methods defined with `def self.`, so
  # this reader has always been publicly callable and is kept that way.
  def self.logger
    @logger
  end
end
100
+
101
+
102
# Self-test of the logging facility; only runs when this file is executed
# directly.
if __FILE__ == $0 then

  require 'test/unit'

  class ScrubytLoggingTestCase < Test::Unit::TestCase
    # Stands in for an IO: records every line written with puts.
    class FauxOutputStream < Array
      def puts(object)
        self << object.to_s
      end
    end

    # Installs a logger writing to a fresh FauxOutputStream (kept in @stream);
    # +logger_args+ are the levels the logger is limited to.
    def setup_logger_with_faux_output_stream!(*logger_args)
      @stream = FauxOutputStream.new
      logger = Scrubyt::Logger.new(*logger_args)
      logger.output_stream = @stream
      Scrubyt.logger = logger
    end

    def test_that_logging_works_with_nil_logger
      Scrubyt.logger = nil
      assert_nothing_raised { Scrubyt.log(:ERROR, 'message') }
    end

    def test_simple_messages_are_output_correctly
      setup_logger_with_faux_output_stream!

      Scrubyt.log :ACTION, 'i just did something'

      assert_equal 1, @stream.size
      assert_equal '[ACTION] i just did something', @stream.first
    end

    def test_that_multiline_messages_are_output_correctly
      setup_logger_with_faux_output_stream!

      Scrubyt.log :ERROR, ['something bad happened', 'dear oh dear']

      # continuation lines are indented by the width of the "[ERROR] " prefix
      continuation_indent = ' ' * '[ERROR] '.length

      assert_equal 1, @stream.size
      assert_equal "[ERROR] something bad happened\n#{continuation_indent}dear oh dear", @stream.first
    end

    def test_that_loggers_can_be_limited_to_specfied_message_levels
      setup_logger_with_faux_output_stream! :ERROR

      Scrubyt.log :ACTION, 'i just did something'
      Scrubyt.log :ERROR, 'something bad happened'

      assert_equal 1, @stream.size
      assert_equal '[ERROR] something bad happened', @stream.first
    end
  end

end
@@ -0,0 +1,139 @@
1
module Scrubyt

  ########################################## NOT USED ANY MORE ##########################################
  require 'set'
  ##
  #=<tt>Post processing results after the extraction</tt>
  #Some things can not be carried out during evaluation - for example
  #the ensure_presence_of_pattern constraint (since the evaluation is top
  #to bottom, at a given point we don't know yet whether the currently
  #evaluated pattern will have a child pattern or not) or removing unneeded
  #results caused by evaluating multiple filters.
  #
  #The sole purpose of this class is to execute these post-processing tasks.
  class PostProcessor
    ##
    #This is just a convenience method to call all the postprocessing
    #functionality and checks.
    #
    #A root pattern without children is accepted: the duplicate removal is
    #skipped for it instead of failing on the missing first child.
    def self.apply_post_processing(root_pattern)
      ensure_presence_of_pattern_full(root_pattern)
      first_child = root_pattern.children[0]
      remove_multiple_filter_duplicates(root_pattern) if first_child && first_child.filters.size > 1
      report_if_no_results(root_pattern) if root_pattern.evaluation_context.extractor.get_mode != :production
    end

    ##
    #Apply the ensure_presence_of_pattern constraint on
    #the full extractor
    def self.ensure_presence_of_pattern_full(pattern)
      ensure_presence_of_pattern(pattern)
      pattern.children.each {|child| ensure_presence_of_pattern_full(child)}
    end

    ##
    #Remove unneeded results of a pattern (caused by evaluating multiple filters)
    #See for example the B&N scenario - the book titles are extracted two times
    #for every pattern (since both examples generate the same XPath for them)
    #but since always only one of the results has a price, the other is discarded
    def self.remove_multiple_filter_duplicates(pattern)
      remove_multiple_filter_duplicates_intern(pattern) if pattern.parent_of_leaf
      pattern.children.each {|child| remove_multiple_filter_duplicates(child)}
    end

    ##
    #Issue an error report if the document did not extract anything.
    #Probably this is because the structure of the page changed or
    #because of some rather nasty bug - in any case, something wrong
    #is going on, and we need to inform the user about this!
    #
    #Nothing is logged as soon as one child of the root has a result.
    def self.report_if_no_results(root_pattern)
      return if root_pattern.children.any? {|child| child.result.childmap.size > 0}

      Scrubyt.log :WARNING, [
        "The extractor did not find any result instances. Most probably this is wrong.",
        "Check your extractor and if you are sure it should work, report a bug!"
      ]
    end

    #The methods below are internal helpers. A bare `private` has no effect
    #on methods defined with `def self.`, so they have always been callable
    #from outside and are left that way.

    #Drops those result instances of +pattern+ which lack a child required
    #by an ensure_presence_of_pattern constraint.
    def self.ensure_presence_of_pattern(pattern)
      #holds the name of those child patterns which have to be present as children of the input parameter
      epop_names = pattern.constraints.select {|c| c.type == Scrubyt::Constraint::CONSTRAINT_TYPE_ENSURE_PRESENCE_OF_PATTERN}.map {|c| c.target}
      return if epop_names.empty?
      #all_parent_values holds instances extracted by pattern
      all_parent_values = pattern.result.childmap.map { |h| h.values }.flatten
      #indices of result instances (of pattern) we are going to remove
      results_to_remove = Set.new
      pattern.children.each do |child_pattern|
        #all_child_values holds instances extracted by child_pattern
        all_child_values = child_pattern.result.childmap.map { |h| h.values }.flatten

        #populate results_to_remove
        all_parent_values.each_with_index do |parent_value, i|
          #Hey! Not just the direct children but all the ancestors
          @found_ancestor = false
          check_ancestors(parent_value, all_child_values)

          results_to_remove << i if (!@found_ancestor && (epop_names.include? child_pattern.name))
        end
      end
      #based on results_to_remove, populate the array 'rejected' which holds the actual instances
      #(and not indices, as in the case of results_to_remove!). In other words, we are mapping
      #results_to_remove indices to their actual instances
      #NOTE(review): 'rejected' is overwritten for every childmap entry, so only
      #the instances of the last entry survive this loop - confirm this is meant.
      rejected = []
      i = -1
      pattern.result.childmap.each do |h|
        h.each { |k,v| rejected = v.reject {|e| i += 1; !results_to_remove.include? i } }
      end

      #Finally, do the actual delete!
      pattern.result.childmap.each { |h| h.each { |k,v| rejected.each { |r| v.delete(r)} } }
    end

    #Sets @found_ancestor if any node below +parent_value+ (at any depth) is
    #one of +all_child_values+. Only Hpricot elements are descended into.
    def self.check_ancestors(parent_value, all_child_values)
      return unless parent_value.is_a? Hpricot::Elem
      parent_value.children.each { |child| @found_ancestor = true if all_child_values.include? child }
      parent_value.children.each { |child| check_ancestors(child, all_child_values) if child.is_a? Hpricot::Elem }
    end

    #Worker of remove_multiple_filter_duplicates for one pattern.
    def self.remove_multiple_filter_duplicates_intern(pattern)
      possible_duplicates = {}
      pattern.result.childmap.each { |entry|
        entry.each do |k,v|
          v.each do |x|
            all_child_results = []
            pattern.children.each { |child|
              temp_res = child.result.lookup(x)
              all_child_results << temp_res if temp_res != nil
            }
            next if all_child_results.size <= 1
            longest_result = all_child_results.map {|e| e.size}.max
            #pad the shorter results with nil so that transpose works; note that
            #these are the arrays handed out by lookup, which get padded in place
            all_child_results.each { |res| (res.size+1).upto(longest_result) { res << nil } }
            possible_duplicates[x] = all_child_results.transpose
          end
        end
      }
      #Determine the 'real' duplicates: the last row of every candidate having
      #more than one row
      real_duplicates = {}
      possible_duplicates.each { |k,v|
        next if v.size == 1
        real_duplicates[k] = v.last unless v.empty?
      }

      #Finally, remove them!
      pattern.children.each { |child|
        child.result.childmap.each { |entry|
          entry.each { |k,v|
            real_duplicates[k].each {|e| v.delete e} if real_duplicates.key? k
          }
        }
      }
    end #end of function
  end #end of class PostProcessor
end #end of module Scrubyt
@@ -0,0 +1,44 @@
1
+ ########################################## NOT USED ANY MORE ##########################################
2
module Scrubyt
  ##
  #=<tt>Represents the results of a pattern</tt>
  #
  #The results are kept in +childmap+: an array of one-entry hashes, each
  #mapping a source (the parent result instance) to the array of results
  #extracted from it.
  class Result
    attr_reader :childmap, :instances

    def initialize
      @childmap = []
    end

    ##
    #Record +result+ as extracted from +source+. A result already recorded
    #for that source is not added again.
    #
    #The entry of +source+ is found with ==, and the array of the entry found
    #is used directly - looking it up again by hash key could miss it for
    #keys which are == but not eql? (like 1 and 1.0).
    def add_result(source, result)
      @childmap.each do |entry|
        key, results = entry.first
        next unless key == source
        results << result unless results.include? result
        return
      end
      @childmap << {source => [result]}
    end

    ##
    #The array of results recorded for +last_result+, or nil if there is none.
    def lookup(last_result)
      @childmap.each do |entry|
        entry.each { |key, value| return value if (key == last_result) }
      end
      nil
    end#end of method lookup
  end#end of class Result
end#end of module Scrubyt
30
+
31
+ #It roughly works like this:
32
+ #
33
+ # root
34
+ # source: nil
35
+ # childmap: [ {doc1 => [doc1]}, {doc2 => [doc2]} ]
36
+
37
+ #table
38
+ # source: doc1
39
+ # childmap [ {doc1 => [table[1]s1, table[2]s1, table[3]s1]}, {doc2 => [table[1]s2, table[2]s2, table[3]s2]} ]
40
+
41
+ #row
42
+ # source: table1s1, table2s1, table3s1
43
+ # childmap: [ {table[1]s1 => [row1s1, row2s1]}, {table[2]s1 => [row3s1, row3s1, row5s1]},
44
+ # {table[1]s2 => [row1s2, row2s2]}, {table[2]s2 => [row3s2, row3s2, row5s2]}]
@@ -0,0 +1,154 @@
1
+ require 'rexml/document'
2
+ require 'rexml/xpath'
3
+
4
+ ########################################## NOT USED ANY MORE ##########################################
5
module Scrubyt
  ##
  #=<tt>Dumping the result in various formats and providing statistics on the results</tt>
  class ResultDumper
    ##
    #Output the results as XML
    #
    #Returns the REXML document built, which is also remembered for
    #print_statistics. Note that +last_result+ of the patterns visited is
    #overwritten while the document is generated.
    def self.to_xml(pattern)
      doc = REXML::Document.new
      root = REXML::Element.new('root')
      doc.add_element(root)
      all_extracted_docs = pattern.last_result
      [all_extracted_docs].flatten.each do |lr|
        pattern.last_result = lr
        to_xml_recursive(pattern, root)
      end
      remove_empty_leaves(doc)
      @@last_doc = doc
    end

    #Remove +node+ if it has neither child elements nor text, then do the
    #same for the child elements of +node+.
    def self.remove_empty_leaves(node)
      node.remove if node.elements.empty? && node.text == nil
      node.elements.each {|child| remove_empty_leaves child }
    end

    ##
    #Output the text of the pattern; If this pattern is a tree, collect the text from its
    #result instance node; otherwise rely on the last_result
    #TODO: throw this away!!!
    def self.to_text(pattern)
      last_result = pattern.last_result
      result = ""
      if pattern.type == :tree
        last_result.traverse_text { |t| result += t.to_s }
      else
        result = last_result
      end
      result
    end

    #Output the results as lines of comma separated values, one line for every
    #child of the XML root. The values are joined as they are, no quoting or
    #escaping is done.
    def self.to_csv(pattern)
      result = []
      flat_csv_inner = lambda {|e, parts|
        content = e.text || ''
        parts << content if ((e.is_a? REXML::Element) && content != '')
        e.children.each {|c| flat_csv_inner.call(c, parts) if c.is_a? REXML::Element }
        parts
      }
      to_xml(pattern).root.elements['/root'].each {|e| result << flat_csv_inner.call(e, []) }
      (result.map! {|a| a.join(',')}).join("\n")
    end

    #Output the results as an array of hashes, one hash for every child of
    #the XML root, mapping element names to their text. The texts of elements
    #sharing a name are joined with a comma.
    def self.to_hash(pattern)
      result = []
      flat_hash_inner = lambda {|e, parts|
        content = e.text ? REXML::Text.unnormalize(e.text) : ''
        if ((e.is_a? REXML::Element) && content != '')
          if parts[e.local_name]
            parts[e.local_name] = parts[e.local_name] + "," + content
          else
            parts[e.local_name] = content
          end
        end
        e.children.each {|c| flat_hash_inner.call(c, parts) if c.is_a? REXML::Element }
        parts
      }
      to_xml(pattern).root.elements['/root'].each {|e| result << flat_hash_inner.call(e, {}) }
      result
    end



    ##
    #Print some simple statistics on the extracted results, like the count of extracted
    #instances by each pattern
    def self.print_statistics(pattern)
      puts "\n" * 2
      print_statistics_recursive(pattern,0)
      puts
    end

    #The methods below are internal helpers. A bare `private` has no effect
    #on methods defined with `def self.`, so they have always been callable
    #from outside and are left that way.

    #Generate the XML of the children of +pattern+ below +element+.
    def self.to_xml_recursive(pattern, element)
      pattern.children.each do |child|
        source = child.parent.last_result
        childresults = child.result.lookup(source)
        #Output text for leaf nodes only; Maybe add possibility to customize this later
        if (childresults == nil)
          ##TODO: is this needed for anything? I guess not! Drop it!!!!!!
          #Update: it seems the blackbox tests are not passing because of this (?) so temporarily adding it back
          res = ""
          if source.is_a? String
            res = source
          elsif !source.nil?
            #a nil source (missing result instance) simply yields no text
            source.traverse_text { |t| res += t.to_s }
          end
          if (child.parent.respond_to?(:size) && child.parent.size == 0) #TODO: respond_to should not be used here, it's just a quick workaround
            element.text = SharedUtils.unescape_entities(res).strip unless element.parent.is_a? REXML::Document
          end
          next
        end

        generate_children(child, childresults, element)
      end
    end

    #Add one element named after +child+ to +element+ for every instance in
    #+childresults+, then descend into the children of +child+.
    def self.generate_children(child, childresults, element)
      if childresults == nil
        child_node = REXML::Element.new(child.name)
        child_node.text = child.default
        element.add_element(child_node)
      else
        childresults.size.times do |num|
          instance = childresults[num]
          child.last_result = instance
          res = ""
          if instance.nil?
            #a missing instance has no text; it used to end up in the branch
            #below and fail on nil.children
          elsif instance.instance_of? String
            res = instance
          elsif instance.respond_to? 'traverse_text'
            instance.traverse_text { |t| res += t.to_s }
          else
            instance.children.each { |c| element.add_element c }
          end
          child_node = REXML::Element.new(child.name)
          child_node.text = SharedUtils.unescape_entities(res).strip if child.write_text
          element.add_element(child_node) if (child.type != :detail_page && child_node.text != '')
          to_xml_recursive(child, child_node)
        end
      end
    end

    #Log the count of the elements named after +pattern+ in the document
    #generated by the last to_xml call, then do so for the children, indented
    #by four more spaces.
    def self.print_statistics_recursive(pattern, depth)
      if pattern.name != 'root'
        if pattern.type == :detail_page
          pattern.evaluation_context.extractor.get_detail_pattern_relations[pattern].parent.children.each do |child|
            print_statistics_recursive(child, depth)
          end
        else
          count = REXML::XPath.match(@@last_doc, "//#{pattern.name}").size
          Scrubyt.log :INFO, (' ' * depth.to_i) + "#{pattern.name} extracted #{count} instances."
        end
      end

      pattern.children.each do |child|
        print_statistics_recursive(child, depth + 4)
      end
    end#end of method print_statistics_recursive
  end #end of class ResultDumper
end #end of module Scrubyt