pstuteville-scrubyt 0.4.31

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (520):
  1. data/CHANGELOG +355 -0
  2. data/COPYING +340 -0
  3. data/README.rdoc +121 -0
  4. data/Rakefile +120 -0
  5. data/VERSION +1 -0
  6. data/examples/README.txt +1 -0
  7. data/examples/events/delta/input.html +682 -0
  8. data/examples/events/delta/test.rb +16 -0
  9. data/examples/misc/auto_regex/input.html +22 -0
  10. data/examples/misc/auto_regex/test.rb +14 -0
  11. data/examples/misc/compound_example/advanced/test.rb +11 -0
  12. data/examples/misc/compound_example/advanced/tricky_compound.html +9 -0
  13. data/examples/misc/compound_example/regexp/regexp_compound.html +17 -0
  14. data/examples/misc/compound_example/regexp/test.rb +11 -0
  15. data/examples/misc/compound_example/simple/compound.html +5 -0
  16. data/examples/misc/compound_example/simple/test.rb +11 -0
  17. data/examples/misc/detail_page/detailpage.html +6 -0
  18. data/examples/misc/detail_page/index.html +9 -0
  19. data/examples/misc/detail_page/test.rb +17 -0
  20. data/examples/misc/google/test.rb +39 -0
  21. data/examples/misc/identical_examples/data_extractor_export.rb +12 -0
  22. data/examples/misc/identical_examples/input.html +16 -0
  23. data/examples/misc/identical_examples/test.rb +15 -0
  24. data/examples/misc/immediate_attribute_extraction/data_extractor_export.rb +10 -0
  25. data/examples/misc/immediate_attribute_extraction/input.html +16 -0
  26. data/examples/misc/immediate_attribute_extraction/test.rb +14 -0
  27. data/examples/misc/multiple_examples/input.html +7 -0
  28. data/examples/misc/multiple_examples/test.rb +22 -0
  29. data/examples/misc/on_click_next/next_page_link.rb +42 -0
  30. data/examples/misc/on_click_next/page_1.html +10 -0
  31. data/examples/misc/on_click_next/page_2.html +10 -0
  32. data/examples/misc/on_click_next/page_3.html +7 -0
  33. data/examples/misc/rubycorner/test.rb +29 -0
  34. data/examples/misc/rubyforge_login/test.rb +30 -0
  35. data/examples/misc/tables/ambigous_records/input.html +17 -0
  36. data/examples/misc/tables/ambigous_records/test.rb +37 -0
  37. data/examples/misc/tables/another_plain_table/input.html +15 -0
  38. data/examples/misc/tables/another_plain_table/test.rb +25 -0
  39. data/examples/misc/tables/complex_table/input.html +45 -0
  40. data/examples/misc/tables/complex_table/test.rb +30 -0
  41. data/examples/misc/tables/grab_rows/input.html +20 -0
  42. data/examples/misc/tables/grab_rows/test.rb +30 -0
  43. data/examples/misc/tables/plain_table/input.html +39 -0
  44. data/examples/misc/tables/plain_table/test.rb +35 -0
  45. data/examples/misc/tables/plain_table_morepages/2.html +38 -0
  46. data/examples/misc/tables/plain_table_morepages/3.html +33 -0
  47. data/examples/misc/tables/plain_table_morepages/input.html +40 -0
  48. data/examples/misc/tables/plain_table_morepages/test.rb +32 -0
  49. data/examples/misc/tables/plain_table_morepages_with_image/2.html +40 -0
  50. data/examples/misc/tables/plain_table_morepages_with_image/3.html +33 -0
  51. data/examples/misc/tables/plain_table_morepages_with_image/images/right_arrow.png +0 -0
  52. data/examples/misc/tables/plain_table_morepages_with_image/input.html +42 -0
  53. data/examples/misc/tables/plain_table_morepages_with_image/test.rb +32 -0
  54. data/examples/misc/tables/test_select_indices/input.html +46 -0
  55. data/examples/misc/tables/test_select_indices/test.rb +55 -0
  56. data/examples/misc/xpath_example_type/input.html +15 -0
  57. data/examples/misc/xpath_example_type/test.rb +18 -0
  58. data/examples/misc/yahoo_finance/test.rb +26 -0
  59. data/examples/social/blog_comment/test.rb +27 -0
  60. data/examples/social/del.icio.us/test.rb +22 -0
  61. data/examples/social/digg/test.rb +37 -0
  62. data/examples/social/dzone/test.rb +28 -0
  63. data/examples/social/linkedin/linkedin.rb +23 -0
  64. data/examples/social/reddit/test.rb +23 -0
  65. data/examples/tones_extractor_export.rb +23 -0
  66. data/examples/webshops/amazon/002-8212888-3924065.html +5311 -0
  67. data/examples/webshops/amazon/002-8212888-3924065_files/0130796034.jpg +0 -0
  68. data/examples/webshops/amazon/002-8212888-3924065_files/020161622X.jpg +0 -0
  69. data/examples/webshops/amazon/002-8212888-3924065_files/0321223675.jpg +0 -0
  70. data/examples/webshops/amazon/002-8212888-3924065_files/0465067107.jpg +0 -0
  71. data/examples/webshops/amazon/002-8212888-3924065_files/0470069155.jpg +0 -0
  72. data/examples/webshops/amazon/002-8212888-3924065_files/0470081201.jpg +0 -0
  73. data/examples/webshops/amazon/002-8212888-3924065_files/0596005253.jpg +0 -0
  74. data/examples/webshops/amazon/002-8212888-3924065_files/0596101325.jpg +0 -0
  75. data/examples/webshops/amazon/002-8212888-3924065_files/0596523696.jpg +0 -0
  76. data/examples/webshops/amazon/002-8212888-3924065_files/0672328844.jpg +0 -0
  77. data/examples/webshops/amazon/002-8212888-3924065_files/0764596861.jpg +0 -0
  78. data/examples/webshops/amazon/002-8212888-3924065_files/0974514055.jpg +0 -0
  79. data/examples/webshops/amazon/002-8212888-3924065_files/0976694069.jpg +0 -0
  80. data/examples/webshops/amazon/002-8212888-3924065_files/0977616606.jpg +0 -0
  81. data/examples/webshops/amazon/002-8212888-3924065_files/0977616614.jpg +0 -0
  82. data/examples/webshops/amazon/002-8212888-3924065_files/0977616630.jpg +0 -0
  83. data/examples/webshops/amazon/002-8212888-3924065_files/1590597362.jpg +0 -0
  84. data/examples/webshops/amazon/002-8212888-3924065_files/1594480060.jpg +0 -0
  85. data/examples/webshops/amazon/002-8212888-3924065_files/1932394699.jpg +0 -0
  86. data/examples/webshops/amazon/002-8212888-3924065_files/2841772101.jpg +0 -0
  87. data/examples/webshops/amazon/002-8212888-3924065_files/amzn-logo-118w.gif +0 -0
  88. data/examples/webshops/amazon/002-8212888-3924065_files/askville-adwidget-bullet.gif +0 -0
  89. data/examples/webshops/amazon/002-8212888-3924065_files/askville-logo-sm-adwidget-white-bg.gif +0 -0
  90. data/examples/webshops/amazon/002-8212888-3924065_files/book_display_on_website-icon.gif +0 -0
  91. data/examples/webshops/amazon/002-8212888-3924065_files/btn-inactive-no-ns.gif +0 -0
  92. data/examples/webshops/amazon/002-8212888-3924065_files/btn-inactive-no.gif +0 -0
  93. data/examples/webshops/amazon/002-8212888-3924065_files/btn-inactive-yes-ns.gif +0 -0
  94. data/examples/webshops/amazon/002-8212888-3924065_files/btn-inactive-yes.gif +0 -0
  95. data/examples/webshops/amazon/002-8212888-3924065_files/btn-no-tiny.gif +0 -0
  96. data/examples/webshops/amazon/002-8212888-3924065_files/btn-yes-tiny.gif +0 -0
  97. data/examples/webshops/amazon/002-8212888-3924065_files/buybox-button-find-gifts-a.gif +0 -0
  98. data/examples/webshops/amazon/002-8212888-3924065_files/c7y_badge_rn_1.gif +0 -0
  99. data/examples/webshops/amazon/002-8212888-3924065_files/cap-a9-3.gif +0 -0
  100. data/examples/webshops/amazon/002-8212888-3924065_files/drop-down-icon-small-arrow.gif +0 -0
  101. data/examples/webshops/amazon/002-8212888-3924065_files/endcap-a9-go-2.gif +0 -0
  102. data/examples/webshops/amazon/002-8212888-3924065_files/gb-open-new.gif +0 -0
  103. data/examples/webshops/amazon/002-8212888-3924065_files/gc-logo-popover-a.gif +0 -0
  104. data/examples/webshops/amazon/002-8212888-3924065_files/gift-cert-roto-pop-a.gif +0 -0
  105. data/examples/webshops/amazon/002-8212888-3924065_files/go-button-books.gif +0 -0
  106. data/examples/webshops/amazon/002-8212888-3924065_files/go-button.gif +0 -0
  107. data/examples/webshops/amazon/002-8212888-3924065_files/go-orange-trans.gif +0 -0
  108. data/examples/webshops/amazon/002-8212888-3924065_files/go_button_photo.gif +0 -0
  109. data/examples/webshops/amazon/002-8212888-3924065_files/logo-off.gif +0 -0
  110. data/examples/webshops/amazon/002-8212888-3924065_files/n2CoreLibs-events-18134.js +1407 -0
  111. data/examples/webshops/amazon/002-8212888-3924065_files/n2CoreLibs-n2v1-57871.css +364 -0
  112. data/examples/webshops/amazon/002-8212888-3924065_files/n2CoreLibs-simplePopover-41153.js +749 -0
  113. data/examples/webshops/amazon/002-8212888-3924065_files/n2CoreLibs-utilities-25439.js +1608 -0
  114. data/examples/webshops/amazon/002-8212888-3924065_files/orange-arrow.gif +0 -0
  115. data/examples/webshops/amazon/002-8212888-3924065_files/orange-arrow_002.gif +0 -0
  116. data/examples/webshops/amazon/002-8212888-3924065_files/popover-blurb.gif +0 -0
  117. data/examples/webshops/amazon/002-8212888-3924065_files/powered-by-a9.gif +0 -0
  118. data/examples/webshops/amazon/002-8212888-3924065_files/stars-3-5.gif +0 -0
  119. data/examples/webshops/amazon/002-8212888-3924065_files/stars-4-0.gif +0 -0
  120. data/examples/webshops/amazon/002-8212888-3924065_files/stars-4-5.gif +0 -0
  121. data/examples/webshops/amazon/002-8212888-3924065_files/stars-5-0.gif +0 -0
  122. data/examples/webshops/amazon/002-8212888-3924065_files/tagline-adwidget.gif +0 -0
  123. data/examples/webshops/amazon/002-8212888-3924065_files/topnav-cart.gif +0 -0
  124. data/examples/webshops/amazon/002-8212888-3924065_files/transparent-pixel.gif +0 -0
  125. data/examples/webshops/amazon/002-8212888-3924065_files/transparent-pixel_002.gif +0 -0
  126. data/examples/webshops/amazon/test.rb +27 -0
  127. data/examples/webshops/amazon-online/test.rb +34 -0
  128. data/examples/webshops/barnes_and_noble/test.rb +32 -0
  129. data/examples/webshops/barnes_offline/barnes_and_noble.html +115 -0
  130. data/examples/webshops/barnes_offline/barnes_and_noble_files/10964080.gif +0 -0
  131. data/examples/webshops/barnes_offline/barnes_and_noble_files/10999676.gif +0 -0
  132. data/examples/webshops/barnes_offline/barnes_and_noble_files/11018492.gif +0 -0
  133. data/examples/webshops/barnes_offline/barnes_and_noble_files/11656534.gif +0 -0
  134. data/examples/webshops/barnes_offline/barnes_and_noble_files/11985045.gif +0 -0
  135. data/examples/webshops/barnes_offline/barnes_and_noble_files/12052378.gif +0 -0
  136. data/examples/webshops/barnes_offline/barnes_and_noble_files/12138286.gif +0 -0
  137. data/examples/webshops/barnes_offline/barnes_and_noble_files/12533212.gif +0 -0
  138. data/examples/webshops/barnes_offline/barnes_and_noble_files/12533268.gif +0 -0
  139. data/examples/webshops/barnes_offline/barnes_and_noble_files/9583392.gif +0 -0
  140. data/examples/webshops/barnes_offline/barnes_and_noble_files/SearchProduct.css +626 -0
  141. data/examples/webshops/barnes_offline/barnes_and_noble_files/admin3_gtpointup.gif +0 -0
  142. data/examples/webshops/barnes_offline/barnes_and_noble_files/admin_aboutshipping.gif +0 -0
  143. data/examples/webshops/barnes_offline/barnes_and_noble_files/admin_account.gif +0 -0
  144. data/examples/webshops/barnes_offline/barnes_and_noble_files/admin_colon.gif +0 -0
  145. data/examples/webshops/barnes_offline/barnes_and_noble_files/admin_giftreminder.gif +0 -0
  146. data/examples/webshops/barnes_offline/barnes_and_noble_files/admin_help.gif +0 -0
  147. data/examples/webshops/barnes_offline/barnes_and_noble_files/admin_orderstatus.gif +0 -0
  148. data/examples/webshops/barnes_offline/barnes_and_noble_files/admin_wishlist.gif +0 -0
  149. data/examples/webshops/barnes_offline/barnes_and_noble_files/bg.gif +0 -0
  150. data/examples/webshops/barnes_offline/barnes_and_noble_files/btnGoGrn.gif +0 -0
  151. data/examples/webshops/barnes_offline/barnes_and_noble_files/cleardot.gif +0 -0
  152. data/examples/webshops/barnes_offline/barnes_and_noble_files/cleardot_002.gif +0 -0
  153. data/examples/webshops/barnes_offline/barnes_and_noble_files/dot4.gif +0 -0
  154. data/examples/webshops/barnes_offline/barnes_and_noble_files/dotGold20.gif +0 -0
  155. data/examples/webshops/barnes_offline/barnes_and_noble_files/hdCantFind.gif +0 -0
  156. data/examples/webshops/barnes_offline/barnes_and_noble_files/hdSearchResults.gif +0 -0
  157. data/examples/webshops/barnes_offline/barnes_and_noble_files/hgg_tab_home_cold.gif +0 -0
  158. data/examples/webshops/barnes_offline/barnes_and_noble_files/hgg_tab_toy_cold.gif +0 -0
  159. data/examples/webshops/barnes_offline/barnes_and_noble_files/iframeKMP.js +172 -0
  160. data/examples/webshops/barnes_offline/barnes_and_noble_files/kmp_iframe_cds2.html +25 -0
  161. data/examples/webshops/barnes_offline/barnes_and_noble_files/kmp_iframe_cds2_data/070226_mc_lnav_search.gif +0 -0
  162. data/examples/webshops/barnes_offline/barnes_and_noble_files/kmp_iframe_cds2_data/XmlUtil.js +199 -0
  163. data/examples/webshops/barnes_offline/barnes_and_noble_files/kmp_iframe_cds2_data/XslStyleSheet.js +1 -0
  164. data/examples/webshops/barnes_offline/barnes_and_noble_files/kmp_iframe_cds2_data/kmp_gen.css +81 -0
  165. data/examples/webshops/barnes_offline/barnes_and_noble_files/kmp_iframe_cds2_data/product-preview-core.js +337 -0
  166. data/examples/webshops/barnes_offline/barnes_and_noble_files/kmp_iframe_cds2_data/product-preview.css +36 -0
  167. data/examples/webshops/barnes_offline/barnes_and_noble_files/kmp_iframe_cds2_data/productpreview.js +11 -0
  168. data/examples/webshops/barnes_offline/barnes_and_noble_files/linePagination.gif +0 -0
  169. data/examples/webshops/barnes_offline/barnes_and_noble_files/logo_bn05.gif +0 -0
  170. data/examples/webshops/barnes_offline/barnes_and_noble_files/navbar.js +34 -0
  171. data/examples/webshops/barnes_offline/barnes_and_noble_files/navbar_06.css +136 -0
  172. data/examples/webshops/barnes_offline/barnes_and_noble_files/popup_open.js +116 -0
  173. data/examples/webshops/barnes_offline/barnes_and_noble_files/qsearch3_vline_dots.gif +0 -0
  174. data/examples/webshops/barnes_offline/barnes_and_noble_files/qsearch4_search.gif +0 -0
  175. data/examples/webshops/barnes_offline/barnes_and_noble_files/qsearch_AdvSearch.jpg +0 -0
  176. data/examples/webshops/barnes_offline/barnes_and_noble_files/subnav_colon.gif +0 -0
  177. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_Bookclubs_cold.gif +0 -0
  178. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_bnjr_cold.gif +0 -0
  179. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_books_hot.gif +0 -0
  180. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_dvd_cold.gif +0 -0
  181. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_giftcards_cold.gif +0 -0
  182. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_home_cold.gif +0 -0
  183. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_member_cc_cold.gif +0 -0
  184. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_music_cold.gif +0 -0
  185. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_pipe.gif +0 -0
  186. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_textbooksonly_cold.gif +0 -0
  187. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_usedoop_cold.gif +0 -0
  188. data/examples/webshops/barnes_offline/barnes_and_noble_files/tab_videogames_cold.gif +0 -0
  189. data/examples/webshops/barnes_offline/barnes_and_noble_files/toppromo3_rule.gif +0 -0
  190. data/examples/webshops/barnes_offline/barnes_and_noble_files/toppromo_fastfree05.gif +0 -0
  191. data/examples/webshops/barnes_offline/barnes_and_noble_files/vcart_btn_checkout.gif +0 -0
  192. data/examples/webshops/barnes_offline/barnes_and_noble_files/vcart_icon_cart.gif +0 -0
  193. data/examples/webshops/barnes_offline/barnes_and_noble_files/vcart_topbot_rule.gif +0 -0
  194. data/examples/webshops/barnes_offline/barnes_and_noble_files/visualcart_prodid.js +401 -0
  195. data/examples/webshops/barnes_offline/test.rb +30 -0
  196. data/examples/webshops/buydig/2_files/03AA1BB9089A4A6A92CF23F280D664EB.jpg +0 -0
  197. data/examples/webshops/buydig/2_files/1008.gif +0 -0
  198. data/examples/webshops/buydig/2_files/1013.gif +0 -0
  199. data/examples/webshops/buydig/2_files/1020.gif +0 -0
  200. data/examples/webshops/buydig/2_files/106CF2FB84B446518397517C3E6D5AD8.jpg +0 -0
  201. data/examples/webshops/buydig/2_files/13-www.gif +0 -0
  202. data/examples/webshops/buydig/2_files/1E9BB2E56AB145FC9D6EF952703AF476.jpg +0 -0
  203. data/examples/webshops/buydig/2_files/1FCDFBE85CDB4D429EC2C8CB24D20457.jpg +0 -0
  204. data/examples/webshops/buydig/2_files/1pix.gif +0 -0
  205. data/examples/webshops/buydig/2_files/2014.gif +0 -0
  206. data/examples/webshops/buydig/2_files/2089.gif +0 -0
  207. data/examples/webshops/buydig/2_files/24992_medal.gif +0 -0
  208. data/examples/webshops/buydig/2_files/24BBCBA1397F4DDCBBBBE8456D6D6E5B.jpg +0 -0
  209. data/examples/webshops/buydig/2_files/281F8A6019B140F38DFD45EB7B69B0FB.jpg +0 -0
  210. data/examples/webshops/buydig/2_files/2975F866CB2149F7ACBC559C8E24E304.jpg +0 -0
  211. data/examples/webshops/buydig/2_files/316FC9256DC9460ABC3C5ECAF6C60286.jpg +0 -0
  212. data/examples/webshops/buydig/2_files/50569327D8B94252B95E449AE470E505.jpg +0 -0
  213. data/examples/webshops/buydig/2_files/519CDAB404FA4543B76B5F281468ACBF.jpg +0 -0
  214. data/examples/webshops/buydig/2_files/57D6146419A647BA89C96AF0B5CAB03C.jpg +0 -0
  215. data/examples/webshops/buydig/2_files/58E3F988E184448B8C0A59874AE123A8.jpg +0 -0
  216. data/examples/webshops/buydig/2_files/5E5B10197A4E4C9B9ECCD6309DBE4C54.jpg +0 -0
  217. data/examples/webshops/buydig/2_files/609A249177D04065B37B9161CB0BC92D.jpg +0 -0
  218. data/examples/webshops/buydig/2_files/676CEE8E53C2445982E991871B4DF613.jpg +0 -0
  219. data/examples/webshops/buydig/2_files/712BA08FAB524A31A76ABB9E2009FF8E.jpg +0 -0
  220. data/examples/webshops/buydig/2_files/734BD08D7A5049339393166491D09D21.jpg +0 -0
  221. data/examples/webshops/buydig/2_files/751E72B7003343248497FE6905F80787.jpg +0 -0
  222. data/examples/webshops/buydig/2_files/76493D4F02F14EF7B5886510604C7BB4.jpg +0 -0
  223. data/examples/webshops/buydig/2_files/79521E251278486DB29529C60C9D012A.jpg +0 -0
  224. data/examples/webshops/buydig/2_files/9C9AF82AC3B54BDC8C705278B50FDFD6.jpg +0 -0
  225. data/examples/webshops/buydig/2_files/BC3FD8307B9948FDB7EEF156D8629C37.jpg +0 -0
  226. data/examples/webshops/buydig/2_files/C0DD4574765047D1836F505E69DC8AE5.jpg +0 -0
  227. data/examples/webshops/buydig/2_files/C143F48515274A44B04F4B3E46306BD2.jpg +0 -0
  228. data/examples/webshops/buydig/2_files/C6B02E88F729464699DB275D140F4563.jpg +0 -0
  229. data/examples/webshops/buydig/2_files/CE334D6206DB4FA9AFDF339AEF0AF50F.jpg +0 -0
  230. data/examples/webshops/buydig/2_files/D66AE0DC865A4021AB300ED3A0C4CD11.jpg +0 -0
  231. data/examples/webshops/buydig/2_files/DEA2EC2093DC474D96B651068576DAE5.jpg +0 -0
  232. data/examples/webshops/buydig/2_files/F547677D83844042BF13A4BE6523BB50.jpg +0 -0
  233. data/examples/webshops/buydig/2_files/Rbbbonlineseal.gif +0 -0
  234. data/examples/webshops/buydig/2_files/TopSellers_bottom.gif +0 -0
  235. data/examples/webshops/buydig/2_files/TopSellers_ttl.gif +0 -0
  236. data/examples/webshops/buydig/2_files/addToFavorites_ttl.gif +0 -0
  237. data/examples/webshops/buydig/2_files/banner_CorporateSales.gif +0 -0
  238. data/examples/webshops/buydig/2_files/banner_Shipping.gif +0 -0
  239. data/examples/webshops/buydig/2_files/bizratehonoree.gif +0 -0
  240. data/examples/webshops/buydig/2_files/btn_submit.gif +0 -0
  241. data/examples/webshops/buydig/2_files/checkFlash.js +110 -0
  242. data/examples/webshops/buydig/2_files/checkFlash2.js +109 -0
  243. data/examples/webshops/buydig/2_files/cnetcertified.gif +0 -0
  244. data/examples/webshops/buydig/2_files/credPriceGrabber.gif +0 -0
  245. data/examples/webshops/buydig/2_files/credShopping.gif +0 -0
  246. data/examples/webshops/buydig/2_files/credential_paypal.gif +0 -0
  247. data/examples/webshops/buydig/2_files/credentials.gif +0 -0
  248. data/examples/webshops/buydig/2_files/dealtime.gif +0 -0
  249. data/examples/webshops/buydig/2_files/dvxstyle.css +754 -0
  250. data/examples/webshops/buydig/2_files/footer_021306_1_v1.gif +0 -0
  251. data/examples/webshops/buydig/2_files/func.js +132 -0
  252. data/examples/webshops/buydig/2_files/getseal +1 -0
  253. data/examples/webshops/buydig/2_files/help.gif +0 -0
  254. data/examples/webshops/buydig/2_files/home.gif +0 -0
  255. data/examples/webshops/buydig/2_files/java.js +155 -0
  256. data/examples/webshops/buydig/2_files/leftnv_help.gif +0 -0
  257. data/examples/webshops/buydig/2_files/logo.gif +0 -0
  258. data/examples/webshops/buydig/2_files/logo2.gif +0 -0
  259. data/examples/webshops/buydig/2_files/logo3.gif +0 -0
  260. data/examples/webshops/buydig/2_files/main.js +227 -0
  261. data/examples/webshops/buydig/2_files/mastercard_secured.gif +0 -0
  262. data/examples/webshops/buydig/2_files/newsBox_bkg.jpg +0 -0
  263. data/examples/webshops/buydig/2_files/newsBox_bottom.jpg +0 -0
  264. data/examples/webshops/buydig/2_files/newsBox_text.gif +0 -0
  265. data/examples/webshops/buydig/2_files/newsBox_ttl.jpg +0 -0
  266. data/examples/webshops/buydig/2_files/noimage75.gif +0 -0
  267. data/examples/webshops/buydig/2_files/orangeleftfrmbtm.gif +0 -0
  268. data/examples/webshops/buydig/2_files/pixel153.gif +0 -0
  269. data/examples/webshops/buydig/2_files/rightnv_bottom.gif +0 -0
  270. data/examples/webshops/buydig/2_files/search_btn_off.gif +0 -0
  271. data/examples/webshops/buydig/2_files/search_c1.gif +0 -0
  272. data/examples/webshops/buydig/2_files/search_c2.gif +0 -0
  273. data/examples/webshops/buydig/2_files/search_c3.gif +0 -0
  274. data/examples/webshops/buydig/2_files/search_c4.gif +0 -0
  275. data/examples/webshops/buydig/2_files/search_down.gif +0 -0
  276. data/examples/webshops/buydig/2_files/search_left.gif +0 -0
  277. data/examples/webshops/buydig/2_files/search_right.gif +0 -0
  278. data/examples/webshops/buydig/2_files/search_top.gif +0 -0
  279. data/examples/webshops/buydig/2_files/siteLinks_bottom.gif +0 -0
  280. data/examples/webshops/buydig/2_files/siteLinks_bullet.gif +0 -0
  281. data/examples/webshops/buydig/2_files/siteLinks_ttl.gif +0 -0
  282. data/examples/webshops/buydig/2_files/spacer.gif +0 -0
  283. data/examples/webshops/buydig/2_files/style.js +45 -0
  284. data/examples/webshops/buydig/2_files/styles.html +33 -0
  285. data/examples/webshops/buydig/2_files/track_orders.jpg +0 -0
  286. data/examples/webshops/buydig/2_files/urchin +534 -0
  287. data/examples/webshops/buydig/2_files/verified_by_visa.gif +0 -0
  288. data/examples/webshops/buydig/2_files/welcome.gif +0 -0
  289. data/examples/webshops/buydig/2_files/welcome_ttl.gif +0 -0
  290. data/examples/webshops/buydig/2_files/yahoologo.gif +0 -0
  291. data/examples/webshops/buydig/input.html +1194 -0
  292. data/examples/webshops/buydig/test.rb +31 -0
  293. data/examples/webshops/ebay/test.rb +32 -0
  294. data/examples/webshops/finewines_offline/_finewines.html +1739 -0
  295. data/examples/webshops/finewines_offline/_finewines_cut.html +371 -0
  296. data/examples/webshops/finewines_offline/_finewines_files/011064.jpg +0 -0
  297. data/examples/webshops/finewines_offline/_finewines_files/012674.jpg +0 -0
  298. data/examples/webshops/finewines_offline/_finewines_files/013268.jpg +0 -0
  299. data/examples/webshops/finewines_offline/_finewines_files/013300.jpg +0 -0
  300. data/examples/webshops/finewines_offline/_finewines_files/013409.jpg +0 -0
  301. data/examples/webshops/finewines_offline/_finewines_files/014340.jpg +0 -0
  302. data/examples/webshops/finewines_offline/_finewines_files/015073.jpg +0 -0
  303. data/examples/webshops/finewines_offline/_finewines_files/015255.jpg +0 -0
  304. data/examples/webshops/finewines_offline/_finewines_files/015479.jpg +0 -0
  305. data/examples/webshops/finewines_offline/_finewines_files/015487.jpg +0 -0
  306. data/examples/webshops/finewines_offline/_finewines_files/017038.jpg +0 -0
  307. data/examples/webshops/finewines_offline/_finewines_files/017129.jpg +0 -0
  308. data/examples/webshops/finewines_offline/_finewines_files/017145.jpg +0 -0
  309. data/examples/webshops/finewines_offline/_finewines_files/017152.jpg +0 -0
  310. data/examples/webshops/finewines_offline/_finewines_files/017285.jpg +0 -0
  311. data/examples/webshops/finewines_offline/_finewines_files/017392.jpg +0 -0
  312. data/examples/webshops/finewines_offline/_finewines_files/017400.jpg +0 -0
  313. data/examples/webshops/finewines_offline/_finewines_files/019778.jpg +0 -0
  314. data/examples/webshops/finewines_offline/_finewines_files/019786.jpg +0 -0
  315. data/examples/webshops/finewines_offline/_finewines_files/020503.jpg +0 -0
  316. data/examples/webshops/finewines_offline/_finewines_files/021253.jpg +0 -0
  317. data/examples/webshops/finewines_offline/_finewines_files/021279.jpg +0 -0
  318. data/examples/webshops/finewines_offline/_finewines_files/021337.jpg +0 -0
  319. data/examples/webshops/finewines_offline/_finewines_files/021352.jpg +0 -0
  320. data/examples/webshops/finewines_offline/_finewines_files/023002.jpg +0 -0
  321. data/examples/webshops/finewines_offline/_finewines_files/023135.jpg +0 -0
  322. data/examples/webshops/finewines_offline/_finewines_files/023143.jpg +0 -0
  323. data/examples/webshops/finewines_offline/_finewines_files/023788.jpg +0 -0
  324. data/examples/webshops/finewines_offline/_finewines_files/024166.jpg +0 -0
  325. data/examples/webshops/finewines_offline/_finewines_files/024182.jpg +0 -0
  326. data/examples/webshops/finewines_offline/_finewines_files/024216.jpg +0 -0
  327. data/examples/webshops/finewines_offline/_finewines_files/027268.jpg +0 -0
  328. data/examples/webshops/finewines_offline/_finewines_files/027516.jpg +0 -0
  329. data/examples/webshops/finewines_offline/_finewines_files/027862.jpg +0 -0
  330. data/examples/webshops/finewines_offline/_finewines_files/028118.jpg +0 -0
  331. data/examples/webshops/finewines_offline/_finewines_files/028936.jpg +0 -0
  332. data/examples/webshops/finewines_offline/_finewines_files/033894.jpg +0 -0
  333. data/examples/webshops/finewines_offline/_finewines_files/033902.jpg +0 -0
  334. data/examples/webshops/finewines_offline/_finewines_files/033910.jpg +0 -0
  335. data/examples/webshops/finewines_offline/_finewines_files/033936.jpg +0 -0
  336. data/examples/webshops/finewines_offline/_finewines_files/033944.jpg +0 -0
  337. data/examples/webshops/finewines_offline/_finewines_files/033951.jpg +0 -0
  338. data/examples/webshops/finewines_offline/_finewines_files/034553.jpg +0 -0
  339. data/examples/webshops/finewines_offline/_finewines_files/034561.jpg +0 -0
  340. data/examples/webshops/finewines_offline/_finewines_files/232439.jpg +0 -0
  341. data/examples/webshops/finewines_offline/_finewines_files/237834.jpg +0 -0
  342. data/examples/webshops/finewines_offline/_finewines_files/268359.jpg +0 -0
  343. data/examples/webshops/finewines_offline/_finewines_files/289082.jpg +0 -0
  344. data/examples/webshops/finewines_offline/_finewines_files/331603.jpg +0 -0
  345. data/examples/webshops/finewines_offline/_finewines_files/369686.jpg +0 -0
  346. data/examples/webshops/finewines_offline/_finewines_files/420257.jpg +0 -0
  347. data/examples/webshops/finewines_offline/_finewines_files/422014.jpg +0 -0
  348. data/examples/webshops/finewines_offline/_finewines_files/460410.jpg +0 -0
  349. data/examples/webshops/finewines_offline/_finewines_files/480533.jpg +0 -0
  350. data/examples/webshops/finewines_offline/_finewines_files/556795.jpg +0 -0
  351. data/examples/webshops/finewines_offline/_finewines_files/597054.jpg +0 -0
  352. data/examples/webshops/finewines_offline/_finewines_files/650606.jpg +0 -0
  353. data/examples/webshops/finewines_offline/_finewines_files/652628.jpg +0 -0
  354. data/examples/webshops/finewines_offline/_finewines_files/653790.jpg +0 -0
  355. data/examples/webshops/finewines_offline/_finewines_files/658450.jpg +0 -0
  356. data/examples/webshops/finewines_offline/_finewines_files/660027.jpg +0 -0
  357. data/examples/webshops/finewines_offline/_finewines_files/660951.jpg +0 -0
  358. data/examples/webshops/finewines_offline/_finewines_files/684514.jpg +0 -0
  359. data/examples/webshops/finewines_offline/_finewines_files/685131.jpg +0 -0
  360. data/examples/webshops/finewines_offline/_finewines_files/686865.jpg +0 -0
  361. data/examples/webshops/finewines_offline/_finewines_files/699330.jpg +0 -0
  362. data/examples/webshops/finewines_offline/_finewines_files/703017.jpg +0 -0
  363. data/examples/webshops/finewines_offline/_finewines_files/703140.jpg +0 -0
  364. data/examples/webshops/finewines_offline/_finewines_files/703850.jpg +0 -0
  365. data/examples/webshops/finewines_offline/_finewines_files/717306.jpg +0 -0
  366. data/examples/webshops/finewines_offline/_finewines_files/900274.jpg +0 -0
  367. data/examples/webshops/finewines_offline/_finewines_files/938225.jpg +0 -0
  368. data/examples/webshops/finewines_offline/_finewines_files/947440.jpg +0 -0
  369. data/examples/webshops/finewines_offline/_finewines_files/951319.jpg +0 -0
  370. data/examples/webshops/finewines_offline/_finewines_files/967893.jpg +0 -0
  371. data/examples/webshops/finewines_offline/_finewines_files/981407.jpg +0 -0
  372. data/examples/webshops/finewines_offline/_finewines_files/981613.jpg +0 -0
  373. data/examples/webshops/finewines_offline/_finewines_files/982421.jpg +0 -0
  374. data/examples/webshops/finewines_offline/_finewines_files/985598.jpg +0 -0
  375. data/examples/webshops/finewines_offline/_finewines_files/986737.jpg +0 -0
  376. data/examples/webshops/finewines_offline/_finewines_files/987503.jpg +0 -0
  377. data/examples/webshops/finewines_offline/_finewines_files/992800.jpg +0 -0
  378. data/examples/webshops/finewines_offline/_finewines_files/VintageslogoEN.gif +0 -0
  379. data/examples/webshops/finewines_offline/_finewines_files/blanc-up.gif +0 -0
  380. data/examples/webshops/finewines_offline/_finewines_files/btn_vintages_latest.gif +0 -0
  381. data/examples/webshops/finewines_offline/_finewines_files/cc_en.gif +0 -0
  382. data/examples/webshops/finewines_offline/_finewines_files/common.js +860 -0
  383. data/examples/webshops/finewines_offline/_finewines_files/drink.gif +0 -0
  384. data/examples/webshops/finewines_offline/_finewines_files/drinkhold.gif +0 -0
  385. data/examples/webshops/finewines_offline/_finewines_files/ec_en.gif +0 -0
  386. data/examples/webshops/finewines_offline/_finewines_files/ev_en.gif +0 -0
  387. data/examples/webshops/finewines_offline/_finewines_files/hold.gif +0 -0
  388. data/examples/webshops/finewines_offline/_finewines_files/index-wines-features.jpg +0 -0
  389. data/examples/webshops/finewines_offline/_finewines_files/indexSTYLE.css +398 -0
  390. data/examples/webshops/finewines_offline/_finewines_files/keyword_search.gif +0 -0
  391. data/examples/webshops/finewines_offline/_finewines_files/mm_menu.js +1 -0
  392. data/examples/webshops/finewines_offline/_finewines_files/nr_en.gif +0 -0
  393. data/examples/webshops/finewines_offline/_finewines_files/ontario_en.gif +0 -0
  394. data/examples/webshops/finewines_offline/_finewines_files/popup.js +81 -0
  395. data/examples/webshops/finewines_offline/_finewines_files/releases_nav.js +229 -0
  396. data/examples/webshops/finewines_offline/_finewines_files/so_en.gif +0 -0
  397. data/examples/webshops/finewines_offline/_finewines_files/spacer.gif +0 -0
  398. data/examples/webshops/finewines_offline/_finewines_files/top.gif +0 -0
  399. data/examples/webshops/finewines_offline/_finewines_files/urchin.js +576 -0
  400. data/examples/webshops/finewines_offline/_finewines_files/wom_en.gif +0 -0
  401. data/examples/webshops/finewines_offline/test.rb +30 -0
  402. data/examples/webshops/us1camera/1_files/1pix.gif +0 -0
  403. data/examples/webshops/us1camera/1_files/1pix_002.gif +0 -0
  404. data/examples/webshops/us1camera/1_files/CnetCertified.gif +0 -0
  405. data/examples/webshops/us1camera/1_files/CyberSource.gif +0 -0
  406. data/examples/webshops/us1camera/1_files/Images50.gif +0 -0
  407. data/examples/webshops/us1camera/1_files/Images50_002.gif +0 -0
  408. data/examples/webshops/us1camera/1_files/Images50_003.gif +0 -0
  409. data/examples/webshops/us1camera/1_files/Images50_004.gif +0 -0
  410. data/examples/webshops/us1camera/1_files/Images50_005.gif +0 -0
  411. data/examples/webshops/us1camera/1_files/Images50_006.gif +0 -0
  412. data/examples/webshops/us1camera/1_files/PriceGrabber.gif +0 -0
  413. data/examples/webshops/us1camera/1_files/QSearch.gif +0 -0
  414. data/examples/webshops/us1camera/1_files/ban-m.jpg +0 -0
  415. data/examples/webshops/us1camera/1_files/banner1.bin +0 -0
  416. data/examples/webshops/us1camera/1_files/banner3.bin +0 -0
  417. data/examples/webshops/us1camera/1_files/block1.jpg +0 -0
  418. data/examples/webshops/us1camera/1_files/block2.jpg +0 -0
  419. data/examples/webshops/us1camera/1_files/block3.jpg +0 -0
  420. data/examples/webshops/us1camera/1_files/block4.jpg +0 -0
  421. data/examples/webshops/us1camera/1_files/block5.jpg +0 -0
  422. data/examples/webshops/us1camera/1_files/block6.jpg +0 -0
  423. data/examples/webshops/us1camera/1_files/bos.js +280 -0
  424. data/examples/webshops/us1camera/1_files/box1.jpg +0 -0
  425. data/examples/webshops/us1camera/1_files/box2.jpg +0 -0
  426. data/examples/webshops/us1camera/1_files/box3.jpg +0 -0
  427. data/examples/webshops/us1camera/1_files/box4.jpg +0 -0
  428. data/examples/webshops/us1camera/1_files/dot.jpg +0 -0
  429. data/examples/webshops/us1camera/1_files/eDevix.gif +0 -0
  430. data/examples/webshops/us1camera/1_files/electronics1.jpg +0 -0
  431. data/examples/webshops/us1camera/1_files/getseal +1 -0
  432. data/examples/webshops/us1camera/1_files/pride.jpg +0 -0
  433. data/examples/webshops/us1camera/1_files/search.jpg +0 -0
  434. data/examples/webshops/us1camera/1_files/sidebutton.jpg +0 -0
  435. data/examples/webshops/us1camera/1_files/sslroilogic.js +49 -0
  436. data/examples/webshops/us1camera/1_files/style.css +1 -0
  437. data/examples/webshops/us1camera/1_files/tl.html +2 -0
  438. data/examples/webshops/us1camera/input.html +548 -0
  439. data/examples/webshops/us1camera/test.rb +37 -0
  440. data/lib/scrubyt/core/navigation/agents/firewatir.rb +285 -0
  441. data/lib/scrubyt/core/navigation/agents/mechanize.rb +315 -0
  442. data/lib/scrubyt/core/navigation/fetch_action.rb +63 -0
  443. data/lib/scrubyt/core/navigation/navigation_actions.rb +107 -0
  444. data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
  445. data/lib/scrubyt/core/scraping/constraint.rb +169 -0
  446. data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
  447. data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
  448. data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
  449. data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
  450. data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
  451. data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
  452. data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
  453. data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
  454. data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
  455. data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
  456. data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
  457. data/lib/scrubyt/core/scraping/pattern.rb +359 -0
  458. data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
  459. data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
  460. data/lib/scrubyt/core/shared/extractor.rb +171 -0
  461. data/lib/scrubyt/logging.rb +154 -0
  462. data/lib/scrubyt/output/post_processor.rb +139 -0
  463. data/lib/scrubyt/output/result.rb +44 -0
  464. data/lib/scrubyt/output/result_dumper.rb +154 -0
  465. data/lib/scrubyt/output/result_node.rb +145 -0
  466. data/lib/scrubyt/output/scrubyt_result.rb +42 -0
  467. data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
  468. data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
  469. data/lib/scrubyt/utils/shared_utils.rb +58 -0
  470. data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
  471. data/lib/scrubyt/utils/xpathutils.rb +202 -0
  472. data/lib/scrubyt.rb +53 -0
  473. data/pkg/scrubyt-0.4.31.gem +0 -0
  474. data/resources/allison/LICENSE +184 -0
  475. data/resources/allison/README +37 -0
  476. data/resources/allison/allison.css +301 -0
  477. data/resources/allison/allison.gif +0 -0
  478. data/resources/allison/allison.js +307 -0
  479. data/resources/allison/allison.rb +287 -0
  480. data/resources/allison/cache/BODY +588 -0
  481. data/resources/allison/cache/CLASS_INDEX +4 -0
  482. data/resources/allison/cache/CLASS_PAGE +1 -0
  483. data/resources/allison/cache/FILE_INDEX +4 -0
  484. data/resources/allison/cache/FILE_PAGE +1 -0
  485. data/resources/allison/cache/FONTS +1 -0
  486. data/resources/allison/cache/FR_INDEX_BODY +1 -0
  487. data/resources/allison/cache/IMGPATH +1 -0
  488. data/resources/allison/cache/INDEX +1 -0
  489. data/resources/allison/cache/JAVASCRIPT +307 -0
  490. data/resources/allison/cache/METHOD_INDEX +4 -0
  491. data/resources/allison/cache/METHOD_LIST +1 -0
  492. data/resources/allison/cache/SRC_PAGE +1 -0
  493. data/resources/allison/cache/STYLE +323 -0
  494. data/resources/allison/cache/URL +1 -0
  495. data/scrubyt.gemspec +609 -0
  496. data/test/blackbox_test.rb +60 -0
  497. data/test/blackbox_tests/basic/multi_root.expected.xml +8 -0
  498. data/test/blackbox_tests/basic/multi_root.rb +6 -0
  499. data/test/blackbox_tests/basic/simple.expected.xml +5 -0
  500. data/test/blackbox_tests/basic/simple.rb +5 -0
  501. data/test/blackbox_tests/basic/three_divs.html +12 -0
  502. data/test/blackbox_tests/detail_page/detail_page_1.html +7 -0
  503. data/test/blackbox_tests/detail_page/detail_page_2.html +7 -0
  504. data/test/blackbox_tests/detail_page/main_page_1.html +5 -0
  505. data/test/blackbox_tests/detail_page/main_page_2.html +6 -0
  506. data/test/blackbox_tests/detail_page/one_detail_page.expected.xml +7 -0
  507. data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
  508. data/test/blackbox_tests/detail_page/two_detail_pages.expected.xml +12 -0
  509. data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
  510. data/test/blackbox_tests/next_page/next_page_link.expected.xml +11 -0
  511. data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
  512. data/test/blackbox_tests/next_page/page_1.html +11 -0
  513. data/test/blackbox_tests/next_page/page_2.html +11 -0
  514. data/test/blackbox_tests/next_page/page_3.html +7 -0
  515. data/test/blackbox_tests/next_page/page_list_links.expected.xml +11 -0
  516. data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
  517. data/test/blackbox_tests/next_page/page_list_links.tofix +7 -0
  518. data/todo/backlog.txt +73 -0
  519. data/todo/scenario_ideas.txt +19 -0
  520. metadata +637 -0
@@ -0,0 +1,145 @@
1
module Scrubyt
  # One node of the extraction result tree.
  #
  # A ResultNode is an Array of its child ResultNodes and additionally carries
  # a name (used as XML tag / hash key), the extracted +result+ (either a
  # String or an object responding to +inner_html+) and an options hash.
  class ResultNode < Array
    OUTPUT_OPTIONS = [:write_text]

    attr_accessor :name, :result, :options, :generated_by_leaf

    def initialize(name, result=nil, options={})
      @name = name
      @result = result
      @options = options
    end

    # Whether the text of this node should be written to the output.
    # An explicit :write_text option wins; otherwise the value of
    # +generated_by_leaf+ is used.
    def write_text
      @options[:write_text].nil? ? @generated_by_leaf : @options[:write_text]
    end

    # True if this node has a String result, writes its own text, or has at
    # least one child with content.
    def has_content?
      return true if result.is_a? String
      write_text || any? { |child| child.has_content? }
    end

    # Text of the node: tags stripped, entities unescaped, whitespace trimmed.
    # Returns the :default option when it is set and the text is empty (or
    # already equals the default).
    #
    # The text is computed on a copy, so +result+ itself is never modified
    # (the previous implementation stripped/unescaped a String result in place,
    # which made #to_xml output depend on whether #to_s had been called before).
    def to_s
      return "" if result.nil?
      raw = @result.is_a?(String) ? @result.dup : @result.inner_html.gsub(/<.*?>/, '')
      text = SharedUtils.unescape_entities(raw)
      text.strip!
      default = @options[:default]
      if default && (text == '' || text == default)
        default
      else
        text
      end
    end

    def inspect
      to_s
    end

    # Builds a libxml node for this node, its children that have content, and
    # its own text when #write_text is set.
    def to_libxml
      libxml_node = XML::Node.new(name)
      each { |child| libxml_node << child.to_libxml if child.has_content? }
      libxml_node << to_s if write_text
      libxml_node
    end

    # XML rendering of the subtree, one element per line.
    def to_xml
      to_xml_lines.join("\n")
    end

    # Returns one Hash per direct child. Each hash maps the (symbolized) names
    # of all nodes below that child which have text or a :default option to
    # their text; texts of nodes sharing a name are joined with +delimiter+.
    def to_hash(delimiter=',')
      collect_fields = lambda do |node, fields|
        if (node.write_text && !node.to_s.empty?) || node.options[:default]
          key = node.name.to_sym
          text = node.to_s
          fields[key] = fields[key] ? fields[key] + delimiter + text : text
        end
        node.each { |child| collect_fields.call(child, fields) }
        fields
      end
      map { |record| collect_fields.call(record, {}) }
    end

    # Like #to_hash, but values that were joined under one key are split up
    # again so that every returned hash holds a single value per key.
    # Returns [] for a node without children.
    def to_flat_hash()
      delimiter = '@@@@@@'
      merge_and_split(to_hash(delimiter), delimiter)
    end

    # Renders the records as a flat list of <item> elements.
    #
    # With a +delimiter+ the records are merged and re-split on that delimiter
    # (one <item> per resulting row). Without a delimiter every record returned
    # by #to_hash becomes one <item> -- including the first one, which the
    # previous implementation removed from the list and never emitted.
    def to_flat_xml(delimiter=nil)
      records = delimiter ? merge_and_split(to_hash(delimiter), delimiter) : to_hash
      lines = []
      records.each do |record|
        lines << "<item>"
        record.each do |key, value|
          xml_tag = key.to_s
          value = '' if value == '#empty#'
          lines << " <#{xml_tag}>#{REXML::Text.normalize(value)}</#{xml_tag}>"
        end
        lines << "</item>"
      end
      lines.join("\n")
    end

    # Returns the XML rendering of the subtree as an array of lines; children
    # are indented by one space relative to their parent.
    def to_xml_lines
      lines = []
      children = select { |child| child.has_content? }
      if children.empty?
        if result.is_a? String
          # String results are written verbatim (not escaped).
          lines << "<#{name}>#{result}</#{name}>"
        elsif write_text && !to_s.empty?
          lines << "<#{name}>#{ERB::Util.html_escape(to_s)}</#{name}>"
        elsif @options[:default]
          lines << "<#{name}>#{@options[:default]}</#{name}>"
        else
          lines << "<#{name}/>"
        end
      else
        lines << "<#{name}>"
        lines << " #{ERB::Util.html_escape(to_s)}" if write_text && !to_s.empty?
        children.each do |child|
          lines.concat(child.to_xml_lines.map { |line| " #{line}" })
        end
        lines << "</#{name}>"
      end
      lines
    end

    private

    # Joins the values of all +records+ per key (keys are taken from the first
    # record) using +delimiter+, splits the joined values again and returns one
    # hash per resulting row. Returns [] when there are no records.
    def merge_and_split(records, delimiter)
      merged = records.shift
      return [] if merged.nil?
      keys = merged.keys
      records.each do |record|
        keys.each { |key| merged[key] += "#{delimiter}#{record[key]}" }
      end
      columns = merged.values.map { |joined| joined.split(delimiter) }
      columns.transpose.map { |row| Hash[keys.zip(row)] }
    end
  end
end
@@ -0,0 +1,42 @@
1
module Scrubyt
  # Root of a result tree; additionally remembers the root patterns and the
  # source (file / proc) of the extractor that produced it.
  class ScrubytResult < ResultNode
    attr_accessor :root_patterns, :source_file, :source_proc

    # Returns a textual explanation followed by an ASCII tree of the extractor
    # patterns. For every pattern the name is printed in brackets, followed by
    # the XPath of its first filter when the pattern's type is :tree.
    #
    # All root patterns are rendered one after the other (previously only the
    # first one was exported, and a missing root pattern raised NoMethodError).
    def export
      #Temporary solution; the real one will be back later - or not
      result = <<-EXPLANATION

=== Extractor tree ===

export() is not working at the moment, due to the removal or ParseTree, ruby2ruby and RubyInline.
For now, in case you are using examples, you can replace them by hand based on the output below.
So if your pattern in the learning extractor looks like

book "Ruby Cookbook"

and you see the following below:

[book] /table[1]/tr/td[2]

then replace "Ruby Cookbook" with "/table[1]/tr/td[2]" (and all the other XPaths) and you are ready!

      EXPLANATION

      tree_builder = lambda do |node, level|
        if level > 0
          # A connector line ("|") followed by the branch marker ("+-- "),
          # both indented according to the depth of the node.
          indent = " " * (level - 1)
          result += indent + "|\n" + indent + "+-- "
        end
        result += "[#{node.name}]"
        result += " #{node.filters[0].xpath}" if node.type == :tree
        result += "\n"

        node.children.each { |child| tree_builder[child, level + 1] }
      end

      Array(root_patterns).each { |root| tree_builder[root, 0] }

      result += "\n"
    end
  end
end
@@ -0,0 +1,50 @@
1
module Scrubyt
  #=<tt>Lookup of compound examples</tt>
  #There are two types of string examples in scRUBYt! right now:
  #the simple example and the compound example.
  #
  #This class is responsible for finding elements matched by compound examples.
  #In the future probably more sophisticated matching algorithms will be added
  #(e.g. match the n-th which matches the text, or element that matches the
  #text but also contains a specific attribute etc.)
  class CompoundExampleLookup
    #Find the index-th node of doc that satisfies every descriptor of
    #compound_example. Returns nil if there is no such node.
    #
    #next_link is accepted for interface compatibility; it is not used here.
    def self.find_node_from_compund_example(doc, compound_example, next_link=false, index = 0)
      @partial_results = []
      self.lookup_compound_example(doc, compound_example, index)
    end

    #NOTE: the helpers below were preceded by a bare 'private', which has no
    #effect on methods defined with 'def self.'; they have therefore always
    #been publicly callable and are kept that way.

    #Lookup the index-th element which is matched by this compound example
    #
    #A compound example is specified with :contains, :begins_with and
    #:ends_with descriptors - which can be both regexps or strings
    #
    #Example:
    #
    #flight_info :begins_with => 'Arrival', :contains => /\d\d-\d+/, :ends_with => '20:00'
    #
    #The first descriptor collects the candidates from the document, every
    #further descriptor only narrows them down. (Previously an empty candidate
    #list made the next descriptor search the whole document again, so a
    #descriptor that matched nothing was silently ignored.)
    def self.lookup_compound_example(doc, compound_example, index)
      searched = false
      compound_example.each do |descriptor, pattern|
        pattern = Regexp.escape(pattern) if pattern.is_a? String
        matcher = case descriptor
                  when :contains    then /#{pattern}/
                  when :begins_with then /^\s*#{pattern}/
                  when :ends_with   then /#{pattern}\s*$/
                  else pattern
                  end
        if searched
          refine_partial_results(matcher)
        else
          @partial_results = SharedUtils.traverse_for_match(doc, matcher)
          searched = true
        end
      end
      @partial_results[index]
    end

    #Keep only those candidates whose text (inner HTML without tags) matches regexp
    def self.refine_partial_results(regexp)
      @partial_results = @partial_results.select { |candidate| candidate.inner_html.gsub(/<.*?>/, '') =~ regexp }
    end

  end #End of class CompoundExampleLookup
end #End of module Scrubyt
@@ -0,0 +1,85 @@
1
class Module
  # Defines one reader method per key of +key_default_hash+. A reader returns
  # @options[key] unless that is nil; otherwise it returns the default. A Proc
  # default is evaluated in the context of the instance on every call.
  def option_reader(key_default_hash)
    key_default_hash.each do |key, default|
      define_method(key) do
        stored = @options[key]
        if !stored.nil?
          stored
        elsif default.is_a? Proc
          # instance_eval passes the receiver as a block argument, which makes
          # zero-arity lambdas raise ArgumentError; instance_exec passes none.
          default.arity == 0 ? instance_exec(&default) : instance_eval(&default)
        else
          default
        end
      end
    end
  end

  # Defines a "key=" writer for every key, storing the value in @options[key].
  def option_writer(*keys)
    keys.each do |key|
      define_method("#{key.to_s}=".to_sym) do |value|
        @options[key] = value
      end
    end
  end

  # Defines a reader for +key+ (with +default+) and, if +writable+, a writer.
  def option(key, default=nil, writable=false)
    option_reader(key => default)
    option_writer(key) if writable
  end

  # Defines a reader and a writer for every key => default pair.
  def option_accessor(key_default_hash)
    key_default_hash.each do |key, default|
      option(key, default, true)
    end
  end
end
37
+
38
class Range
  # Ranges are ordered by their first element.
  def <=>(other)
    self.begin <=> other.begin
  end

  # Returns a new range with both ends shifted up by +amount+.
  # An exclusive range (a...b) stays exclusive.
  def +(amount)
    Range.new(self.begin + amount, self.end + amount, exclude_end?)
  end

  # Returns a new range with both ends shifted down by +amount+.
  # An exclusive range (a...b) stays exclusive.
  def -(amount)
    Range.new(self.begin - amount, self.end - amount, exclude_end?)
  end
end
51
+
52
module Math
  # Returns +a+ when it is strictly smaller than +b+, otherwise +b+.
  def self.min(a, b)
    return a if a < b
    b
  end

  # Returns +a+ when it is strictly greater than +b+, otherwise +b+.
  def self.max(a, b)
    return a if a > b
    b
  end
end
61
+
62
+ #dec 16: Dropped - causes some errors w/ Rails
63
+ #just some hack here to allow current examples' syntax:
64
+ #table_data.to_xml.write(open('result.xml', 'w'), 1)
65
+ #class String
66
+ # def write(stringio, add_indent=0)
67
+ # stringio.write((self.split("\n").collect { |line| (' ' * add_indent) + line }).join("\n"))
68
+ # end
69
+ #end
70
+
71
#hack to simulate ancestor::tag selector of XPath
module Hpricot
  class Elem
    # Walks from this element up towards the document.
    #
    # Without +tag+: returns an Hpricot::Elements holding this element and all
    # of its ancestors (nearest first).
    # With +tag+: returns the first element on that way (starting with this
    # element itself) whose name equals +tag+; if there is none, the collected
    # path is returned as in the no-tag case.
    #
    # The walk stops at the Hpricot::Doc, or at a nil parent for elements that
    # are not attached to a document (which used to raise NoMethodError).
    def ancestors(tag = nil)
      element = self
      path = Hpricot::Elements.new
      until element.nil? || element.class == Hpricot::Doc
        return element if tag && tag == element.name
        path.push element
        element = element.parent
      end
      path
    end
  end
end
@@ -0,0 +1,58 @@
1
+ module Scrubyt
2
+ ##
3
+ #=<tt>Utilities shared between the other utility classes (XPathUtils, SimpleExampleLookup,...)</tt>
4
+ #
5
+ class SharedUtils
6
+ #Entities to replace - need to make this more complete, or install htmlentities or similar package
7
+ ENTITIES = {
8
+ 'quot' => '"',
9
+ 'apos' => "'",
10
+ 'amp' => '&',
11
+ 'lt' => '<',
12
+ 'gt' => '>',
13
+ 'nbsp' => ' '}
14
+
15
+ def self.prepare_text_for_comparison(text)
16
+ unescape_entities text
17
+ text.strip!
18
+ text
19
+ end
20
+
21
+ #Unescape the entities in the HTML!
22
+ def self.unescape_entities(text)
23
+ ENTITIES.each {|e,s| text.gsub!(/\&#{e};/) {"#{s}"} }
24
+ text
25
+ end
26
+
27
+ #Entry point for finding the elements specified by examples
28
+ def self.traverse_for_match(node, regexp)
29
+ results = []
30
+ traverse_for_match_inner = lambda { |node, regexp|
31
+ ft = prepare_text_for_comparison(node.inner_html.gsub(/<.*?>/, ''))
32
+ if ft =~ regexp
33
+ node.instance_eval do
34
+ @match_data = $~
35
+ def match_data
36
+ @match_data
37
+ end
38
+ end
39
+ results << node
40
+ results.delete node.parent if node.is_a? Hpricot::Elem
41
+ end
42
+ node.children.each { |child| traverse_for_match_inner.call(child, regexp) if (child.is_a? Hpricot::Elem) } if ! node.children.nil?
43
+ }
44
+ traverse_for_match_inner.call(node,regexp)
45
+ results
46
+ end
47
+
48
+ def self.get_backtrace
49
+ begin
50
+ raise
51
+ rescue Exception => ex
52
+ backtrace = ex.backtrace
53
+ end
54
+ backtrace.slice!(0)
55
+ backtrace
56
+ end
57
+ end #end of class SharedUtils
58
+ end #end of module Scrubyt
@@ -0,0 +1,40 @@
1
module Scrubyt
  #=<tt>Lookup of simple examples</tt>
  #There are two types of string examples in scRUBYt! right now:
  #the simple example and the compound example.
  #
  #This class is responsible for finding elements matched by simple examples.
  #In the future probably more sophisticated matching algorithms will be added
  #(e.g. match the n-th which matches the text, or element that matches the
  #text but also contains a specific attribute etc.)
  class SimpleExampleLookup
    #From the example text defined by the user, find the lowest possible node which contains the text 'text'.
    #The text can be also a mixed content text, e.g.
    #
    #  <a>Bon <b>nuit</b>, monsieur!</a>
    #
    #In this case, <a>'s text is considered to be "Bon nuit, monsieur"
    #
    #Supported forms of text:
    #  "some text/@attr"  - the "/@attr" suffix is dropped, "some text" is looked up
    #  "some text[2]"     - the match with index 2 is returned instead of index
    #  "name[some text]"  - a node containing "some text" is looked up, then the
    #                       tree is traversed upwards until an element called name
    #
    #The text passed in is not modified (it used to be changed in place, which
    #also made frozen strings fail). next_link is accepted for interface
    #compatibility; it is not used here.
    def self.find_node_from_text(doc, text, next_link=false, index = 0)
      text = text.gsub('»', '&#187;')
      final_element_name = nil
      #Process immediate attribute extraction (like "go to google.com/@href")
      if text =~ /.+\/@.+$/
        text = text.scan(/^(.+?)\/@.+$/)[0][0]
      elsif text =~ /.+\[\d+\]$/
        text, position = text.scan(/(.+)\[(\d+)\]$/)[0]
        index = position.to_i
      elsif text =~ /.+\[.+\]$/
        final_element_name = text.scan(/^(.+?)\[/)[0][0]
        text = text.scan(/\[(.+?)\]/)[0][0]
      end
      escaped = Regexp.escape(text)
      if final_element_name
        #partial match, then walk up to the requested element
        result = SharedUtils.traverse_for_match(doc, /#{escaped}/)[index]
        result = XPathUtils.traverse_up_until_name(result, final_element_name)
      else
        #the whole text of the node has to match
        result = SharedUtils.traverse_for_match(doc, /^#{escaped}$/)[index]
      end
    end
  end #End of class SimpleExampleLookup
end #End of module Scrubyt
@@ -0,0 +1,202 @@
1
+ require 'rubygems'
2
+ require 'hpricot'
3
+
4
+ module Scrubyt
5
+ ##
6
+ #=<tt>Various XPath utility functions</tt>
7
+ class XPathUtils
8
+
9
+ #Find the LCA (Lowest Common Ancestor) of two nodes
10
+ def self.lowest_common_ancestor(node1, node2)
11
+ path1 = traverse_up(node1)
12
+ path2 = traverse_up(node2)
13
+ return node1.parent if path1 == path2
14
+
15
+ closure = nil
16
+ while (!path1.empty? && !path2.empty?)
17
+ closure = path1.pop
18
+ return closure.parent if (closure != path2.pop)
19
+ end
20
+ path1.size > path2.size ? path1.last.parent : path2.last.parent
21
+ end
22
+
23
+ ##
24
+ #Generate XPath for the given node
25
+ #
26
+ #*parameters*
27
+ #
28
+ #_node_ - The node we are looking up the XPath for
29
+ #
30
+ #_stopnode_ - The Xpath generation is stopped and the XPath that
31
+ #was generated so far is returned if this node is reached.
32
+ #
33
+ #_write_indices_ - whether the index inside the parent shuold be
34
+ #added, as in html[1]/body[1]/table[2]/tr[1]/td[8]
35
+ def self.generate_XPath(node, stopnode=nil, write_indices=false)
36
+ path = []
37
+ indices = []
38
+ found = false
39
+ while !node.nil? && node.class != Hpricot::Doc do
40
+ if node == stopnode
41
+ found = true
42
+ break
43
+ end
44
+ path.push node.name
45
+ indices.push find_index(node) if write_indices
46
+ node = node.parent
47
+ end
48
+ #This condition ensures that if there is a stopnode, and we did not found it along the way,
49
+ #we return nil (since the stopnode is not contained in the path at all)
50
+ return nil if stopnode != nil && !found
51
+ result = ""
52
+ if write_indices
53
+ path.reverse.zip(indices.reverse).each { |node,index| result += "#{node}[#{index}]/" }
54
+ else
55
+ path.reverse.each{ |node| result += "#{node}/" }
56
+ end
57
+ "/" + result.chop
58
+ end
59
+
60
+ #Generate an XPath of the node with indices, relatively to the given
61
+ #relative_root.
62
+ #
63
+ #For example if the elem's absolute XPath is /a/b/c,
64
+ #and the relative root's Xpath is a/b, the result of the function will
65
+ #be /c.
66
+ def self.generate_relative_XPath( elem,relative_root )
67
+ return nil if (elem == relative_root)
68
+ generate_XPath(elem, relative_root, true)
69
+ end
70
+
71
+ #Generate a generalized XPath (i.e. without indices) of the node,
72
+ #relatively to the given relative_root.
73
+ #
74
+ #For example if the elem's absolute XPath is /a[1]/b[3]/c[5],
75
+ #and the relative root's Xpath is a[1]/b[3], the result of the function will
76
+ #be /c.
77
+ def self.generate_generalized_relative_XPath( elem,relative_root )
78
+ return nil if (elem == relative_root)
79
+ generate_XPath(elem, relative_root, false)
80
+ end
81
+
82
+ #Find an image based on the src of the example
83
+ #
84
+ #*parameters*
85
+ #
86
+ #_doc_ - The containing document
87
+ #
88
+ #_example_ - The value of the src attribute of the img tag
89
+ #This is convenient, since if the users rigth-clicks an image and
90
+ #copies image location, this string will be copied to the clipboard
91
+ #and thus can be easily pasted as an examle
92
+ #
93
+ #_index_ - there might be more images with the same src on the page -
94
+ #most typically the user will need the 0th - but if this is not the
95
+ #case, there is the possibility to override this
96
+ def self.find_image(doc, example, index=0)
97
+ if example =~ /\.(jpg|png|gif|jpeg)(\[\d+\])$/
98
+ res = example.scan(/(.+)\[(\d+)\]$/)
99
+ example = res[0][0]
100
+ index = res[0][1].to_i
101
+ end
102
+ (doc/"//img[@src='#{example}']")[index]
103
+ end
104
+
105
+ ##
106
+ #Used to find the parent of a node with the given name - for example
107
+ #find the <form> node which is the parent of the <input> node
108
+ def self.traverse_up_until_name(node, name)
109
+ while node.class != Hpricot::Doc do
110
+ #raise "The element is nil! This probably means the widget with the specified name ('#{name}') does not exist" unless node
111
+ return nil unless node
112
+ break if node.name == name
113
+ node = node.parent
114
+ end
115
+ node
116
+ end
117
+
118
+ ##
119
+ #Used when automatically looking up href attributes (for detail or next links)
120
+ #If the detail pattern did not extract a link, we first look up it's
121
+ #children - and if we don't find a link, traverse up
122
+ def self.find_nearest_node_with_attribute(node, attribute)
123
+ @node = nil
124
+ return node if node.is_a? Hpricot::Elem and node[attribute]
125
+ first_child_node_with_attribute(node, attribute)
126
+ first_parent_node_with_attribute(node, attribute) if !@node
127
+ @node
128
+ end
129
+
130
+ ##
131
+ #Generalre relative XPath from two XPaths: a parent one, (which points higher in the tree),
132
+ #and a child one. The result of the method is the relative XPath of the node pointed to
133
+ #by the second XPath to the node pointed to by the firs XPath.
134
+ def self.generate_relative_XPath_from_XPaths(parent_xpath, child_xpath)
135
+ original_child_xpath_parts = child_xpath.split('/').reject{|s|s==""}
136
+ pairs = to_general_XPath(child_xpath).split('/').reject{|s|s==""}.zip to_general_XPath(parent_xpath).split('/').reject{|s|s==""}
137
+ i = 0
138
+ pairs.each_with_index do |pair,index|
139
+ i = index
140
+ break if pair[0] != pair[1]
141
+ end
142
+ "/" + original_child_xpath_parts[i..-1].join('/')
143
+ end
144
+
145
+ def self.to_full_XPath(doc, xpath, generalize)
146
+ elem = doc/xpath
147
+ elem = elem.map[0] if elem.is_a? Hpricot::Elements
148
+ XPathUtils.generate_XPath(elem, nil, generalize)
149
+ end
150
+
151
+ private
152
+ #Find the index of the child inside the parent
153
+ #For example:
154
+ #
155
+ # tr
156
+ # / | \
157
+ # td td td
158
+ # 0 1 2
159
+ #
160
+ #The last row contains the indices of the td's from the
161
+ #tow above.
162
+ #
163
+ #Note that in classic XPath, the indices start with 1 (rather
164
+ #than 0).
165
+ def self.find_index(node)
166
+ c = 0
167
+ node.parent.children.each do |child|
168
+ if child.class == Hpricot::Elem
169
+ c += 1 if (child.name == node.name)
170
+ break if (node == child)
171
+ end
172
+ end
173
+ c
174
+ end
175
+
176
+ def self.traverse_up(node, stopnode=nil)
177
+ path = []
178
+ while node.class != Hpricot::Doc do
179
+ break if node == stopnode
180
+ path.push node
181
+ node = node.parent
182
+ end
183
+ path
184
+ end
185
+
186
+ def self.first_child_node_with_attribute(node, attribute)
187
+ return if !node.instance_of? Hpricot::Elem || @node
188
+ @node = node if node.attributes[attribute]
189
+ node.children.each { |child| first_child_node_with_attribute(child, attribute) }
190
+ end
191
+
192
+ def self.first_parent_node_with_attribute(node, attribute)
193
+ return if !node.instance_of? Hpricot::Elem || @node
194
+ @node = node if node.attributes[attribute]
195
+ first_parent_node_with_attribute(node.parent, attribute)
196
+ end
197
+
198
+ def self.to_general_XPath(xpath)
199
+ xpath.gsub(/\[.+?\]/) {""}
200
+ end #End of method to_general_XPath
201
+ end #End of class XPathUtils
202
+ end #End of module Scrubyt
data/lib/scrubyt.rb ADDED
@@ -0,0 +1,53 @@
1
+ if RUBY_VERSION < '1.9'
2
+ $KCODE = "u"
3
+ require "jcode"
4
+ end
5
+
6
+ #ruby core
7
+ require "open-uri"
8
+ require "erb"
9
+
10
+ #gems
11
+ require "rexml/text"
12
+ require "rubygems"
13
+ require "mechanize"
14
+ require "hpricot"
15
+
16
+ #scrubyt
17
+ require "#{File.dirname(__FILE__)}/scrubyt/logging"
18
+ require "#{File.dirname(__FILE__)}/scrubyt/utils/ruby_extensions.rb"
19
+ require "#{File.dirname(__FILE__)}/scrubyt/utils/xpathutils.rb"
20
+ require "#{File.dirname(__FILE__)}/scrubyt/utils/shared_utils.rb"
21
+ require "#{File.dirname(__FILE__)}/scrubyt/utils/simple_example_lookup.rb"
22
+ require "#{File.dirname(__FILE__)}/scrubyt/utils/compound_example_lookup.rb"
23
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/constraint_adder.rb"
24
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/constraint.rb"
25
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/result_indexer.rb"
26
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/pre_filter_document.rb"
27
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/compound_example.rb"
28
+ require "#{File.dirname(__FILE__)}/scrubyt/output/result_node.rb"
29
+ require "#{File.dirname(__FILE__)}/scrubyt/output/scrubyt_result.rb"
30
+ require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/mechanize.rb"
31
+
32
+ # -- Making Firewatir optional --
33
+ begin
34
+ require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/firewatir.rb"
35
+ rescue LoadError
36
+ puts "The gem firewatir is not installed, you'll be able to use Mechanize as the agent only"
37
+ end
38
+ # --
39
+
40
+ require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/navigation_actions.rb"
41
+ require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/fetch_action.rb"
42
+ require "#{File.dirname(__FILE__)}/scrubyt/core/shared/extractor.rb"
43
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/base_filter.rb"
44
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/attribute_filter.rb"
45
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/constant_filter.rb"
46
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/script_filter.rb"
47
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/text_filter.rb"
48
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/detail_page_filter.rb"
49
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/download_filter.rb"
50
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/html_subtree_filter.rb"
51
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/regexp_filter.rb"
52
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/tree_filter.rb"
53
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/pattern.rb"
Binary file