newscrapi 0.0.11

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59)
  1. data/.document +5 -0
  2. data/.gitignore +23 -0
  3. data/LICENSE +20 -0
  4. data/README.rdoc +17 -0
  5. data/Rakefile +56 -0
  6. data/VERSION +1 -0
  7. data/config/content_scrapper.rb +3 -0
  8. data/doc/classes/ContentMapping.html +242 -0
  9. data/doc/classes/ContentMapping.src/M000001.html +18 -0
  10. data/doc/classes/ContentMapping.src/M000002.html +18 -0
  11. data/doc/classes/ContentMapping.src/M000003.html +18 -0
  12. data/doc/classes/ContentMapping.src/M000004.html +19 -0
  13. data/doc/classes/ContentMapping.src/M000005.html +18 -0
  14. data/doc/classes/ContentMapping.src/M000006.html +25 -0
  15. data/doc/classes/ContentScrapper.html +297 -0
  16. data/doc/classes/ContentScrapper.src/M000007.html +18 -0
  17. data/doc/classes/ContentScrapper.src/M000008.html +18 -0
  18. data/doc/classes/ContentScrapper.src/M000009.html +20 -0
  19. data/doc/classes/ContentScrapper.src/M000010.html +20 -0
  20. data/doc/classes/ContentScrapper.src/M000011.html +18 -0
  21. data/doc/classes/ContentScrapper.src/M000012.html +21 -0
  22. data/doc/classes/ContentScrapper.src/M000013.html +21 -0
  23. data/doc/classes/ContentScrapper.src/M000014.html +33 -0
  24. data/doc/classes/ContentScrapper.src/M000015.html +18 -0
  25. data/doc/classes/ContentScrapper.src/M000016.html +18 -0
  26. data/doc/classes/Feedzirra.html +111 -0
  27. data/doc/classes/Feedzirra/FeedEntryUtilities.html +152 -0
  28. data/doc/classes/Feedzirra/FeedEntryUtilities.src/M000017.html +18 -0
  29. data/doc/classes/Feedzirra/FeedEntryUtilities.src/M000018.html +18 -0
  30. data/doc/created.rid +1 -0
  31. data/doc/files/lib/content_scrapper/content_mapping_rb.html +108 -0
  32. data/doc/files/lib/content_scrapper/feedzirra_rb.html +115 -0
  33. data/doc/files/lib/content_scrapper_rb.html +112 -0
  34. data/doc/fr_class_index.html +30 -0
  35. data/doc/fr_file_index.html +29 -0
  36. data/doc/fr_method_index.html +44 -0
  37. data/doc/index.html +24 -0
  38. data/doc/rdoc-style.css +208 -0
  39. data/lib/newscrapi.rb +2 -0
  40. data/lib/newscrapi/encoding.rb +44 -0
  41. data/lib/newscrapi/feedzirra.rb +17 -0
  42. data/lib/newscrapi/mapping.rb +50 -0
  43. data/lib/newscrapi/scrapper.rb +129 -0
  44. data/lib/newscrapi/testing.rb +19 -0
  45. data/rails/init.rb +3 -0
  46. data/test/helper.rb +9 -0
  47. data/test/test_encoding.rb +43 -0
  48. data/test/test_mapping.rb +58 -0
  49. data/test/test_pages.rb +69 -0
  50. data/test/test_pages/cdata.html +23 -0
  51. data/test/test_pages/page_without_encoding_meta_tag.html +401 -0
  52. data/test/test_pages/pretty.html +17 -0
  53. data/test/test_pages/pretty_missing_content.html +17 -0
  54. data/test/test_pages/twocontent.html +11 -0
  55. data/test/test_pages/ugly.html +399 -0
  56. data/test/test_pages/utf-8_page.html +405 -0
  57. data/test/test_pages/windows-1250_page.html +460 -0
  58. data/test/test_scrapper.rb +257 -0
  59. metadata +191 -0
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,23 @@
1
+ *.gemspec
2
+
3
+ ## MAC OS
4
+ .DS_Store
5
+
6
+ ## TEXTMATE
7
+ *.tmproj
8
+ tmtags
9
+
10
+ ## EMACS
11
+ *~
12
+ \#*
13
+ .\#*
14
+
15
+ ## VIM
16
+ *.swp
17
+
18
+ ## PROJECT::GENERAL
19
+ coverage
20
+ rdoc
21
+ pkg
22
+
23
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Gyorgy Frivolt
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,17 @@
1
+ = content_scrapper
2
+
3
+ Description goes here.
4
+
5
+ == Note on Patches/Pull Requests
6
+
7
+ * Fork the project.
8
+ * Make your feature addition or bug fix.
9
+ * Add tests for it. This is important so I don't break it in a
10
+ future version unintentionally.
11
+ * Commit, do not mess with rakefile, version, or history.
12
+ (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
13
+ * Send me a pull request. Bonus points for topic branches.
14
+
15
+ == Copyright
16
+
17
+ Copyright (c) 2010 Gyorgy Frivolt. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,56 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "newscrapi"
8
+ gem.summary = "Gem for those who want to screen scrap only the content part of web pages, blogs or articles."
9
+ gem.description = "If you want to cut only the content of pages, without any other part (like the menu, header, footer, commercials, etc.), you might find this gem very handy. A DSL is also defined for nifty definitions for your screen scrapping and sanitization."
10
+ gem.email = "gyorgy.frivolt@gmail.com"
11
+ gem.homepage = "http://github.com/fifigyuri/newscrapi"
12
+ gem.authors = ["Gyorgy Frivolt"]
13
+ gem.add_development_dependency 'thoughtbot-shoulda', '>=2.10.2'
14
+ gem.add_development_dependency 'mocha', '>=0.9.8'
15
+
16
+ gem.add_dependency 'nokogiri', '>=1.4.1'
17
+ gem.add_dependency 'rchardet'
18
+ end
19
+ Jeweler::GemcutterTasks.new
20
+ rescue LoadError
21
+ puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
22
+ end
23
+
24
+ require 'rake/testtask'
25
+ Rake::TestTask.new(:test) do |test|
26
+ test.libs << 'lib' << 'test'
27
+ test.pattern = 'test/**/test_*.rb'
28
+ test.verbose = true
29
+ end
30
+
31
+ begin
32
+ require 'rcov/rcovtask'
33
+ Rcov::RcovTask.new do |test|
34
+ test.libs << 'test'
35
+ test.pattern = 'test/**/test_*.rb'
36
+ test.verbose = true
37
+ end
38
+ rescue LoadError
39
+ task :rcov do
40
+ abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
41
+ end
42
+ end
43
+
44
+ task :test => :check_dependencies
45
+
46
+ task :default => :test
47
+
48
+ require 'rake/rdoctask'
49
+ Rake::RDocTask.new do |rdoc|
50
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
51
+
52
+ rdoc.rdoc_dir = 'rdoc'
53
+ rdoc.title = "content_scrapper #{version}"
54
+ rdoc.rdoc_files.include('README*')
55
+ rdoc.rdoc_files.include('lib/**/*.rb')
56
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.11
data/config/content_scrapper.rb ADDED
@@ -0,0 +1,3 @@
1
+
2
+ sanitize_tags ({:elements => ['p','br', 'b', 'em', 'i', 'strong', 'u', 'a', 'h1', 'h2', 'h3', 'li', 'ol', 'ul'], \
3
+ :attributes => { 'a' => ['href'] }})
data/doc/classes/ContentMapping.html ADDED
@@ -0,0 +1,242 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
7
+ <head>
8
+ <title>Class: ContentMapping</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <meta http-equiv="Content-Script-Type" content="text/javascript" />
11
+ <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
12
+ <script type="text/javascript">
13
+ // <![CDATA[
14
+
15
+ function popupCode( url ) {
16
+ window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
17
+ }
18
+
19
+ function toggleCode( id ) {
20
+ if ( document.getElementById )
21
+ elem = document.getElementById( id );
22
+ else if ( document.all )
23
+ elem = eval( "document.all." + id );
24
+ else
25
+ return false;
26
+
27
+ elemStyle = elem.style;
28
+
29
+ if ( elemStyle.display != "block" ) {
30
+ elemStyle.display = "block"
31
+ } else {
32
+ elemStyle.display = "none"
33
+ }
34
+
35
+ return true;
36
+ }
37
+
38
+ // Make codeblocks hidden by default
39
+ document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }</style>" )
40
+
41
+ // ]]>
42
+ </script>
43
+
44
+ </head>
45
+ <body>
46
+
47
+
48
+
49
+ <div id="classHeader">
50
+ <table class="header-table">
51
+ <tr class="top-aligned-row">
52
+ <td><strong>Class</strong></td>
53
+ <td class="class-name-in-header">ContentMapping</td>
54
+ </tr>
55
+ <tr class="top-aligned-row">
56
+ <td><strong>In:</strong></td>
57
+ <td>
58
+ <a href="../files/lib/content_scrapper/content_mapping_rb.html">
59
+ lib/content_scrapper/content_mapping.rb
60
+ </a>
61
+ <br />
62
+ </td>
63
+ </tr>
64
+
65
+ <tr class="top-aligned-row">
66
+ <td><strong>Parent:</strong></td>
67
+ <td>
68
+ Object
69
+ </td>
70
+ </tr>
71
+ </table>
72
+ </div>
73
+ <!-- banner header -->
74
+
75
+ <div id="bodyContent">
76
+
77
+
78
+
79
+ <div id="contextContent">
80
+
81
+
82
+
83
+ </div>
84
+
85
+ <div id="method-list">
86
+ <h3 class="section-bar">Methods</h3>
87
+
88
+ <div class="name-list">
89
+ <a href="#M000003">content_at</a>&nbsp;&nbsp;
90
+ <a href="#M000004">iconv</a>&nbsp;&nbsp;
91
+ <a href="#M000005">matches_url?</a>&nbsp;&nbsp;
92
+ <a href="#M000001">new</a>&nbsp;&nbsp;
93
+ <a href="#M000006">scrap_content</a>&nbsp;&nbsp;
94
+ <a href="#M000002">url_pattern</a>&nbsp;&nbsp;
95
+ </div>
96
+ </div>
97
+
98
+ </div>
99
+
100
+
101
+ <!-- if includes -->
102
+
103
+ <div id="section">
104
+
105
+
106
+
107
+
108
+
109
+ <div id="attribute-list">
110
+ <h3 class="section-bar">Attributes</h3>
111
+
112
+ <div class="name-list">
113
+ <table>
114
+ <tr class="top-aligned-row context-row">
115
+ <td class="context-item-name">content_xpaths_list</td>
116
+ <td class="context-item-value">&nbsp;[R]&nbsp;</td>
117
+ <td class="context-item-desc"></td>
118
+ </tr>
119
+ <tr class="top-aligned-row context-row">
120
+ <td class="context-item-name">iconv_from</td>
121
+ <td class="context-item-value">&nbsp;[R]&nbsp;</td>
122
+ <td class="context-item-desc"></td>
123
+ </tr>
124
+ <tr class="top-aligned-row context-row">
125
+ <td class="context-item-name">iconv_to</td>
126
+ <td class="context-item-value">&nbsp;[R]&nbsp;</td>
127
+ <td class="context-item-desc"></td>
128
+ </tr>
129
+ <tr class="top-aligned-row context-row">
130
+ <td class="context-item-name">url_pattern_regexp</td>
131
+ <td class="context-item-value">&nbsp;[R]&nbsp;</td>
132
+ <td class="context-item-desc"></td>
133
+ </tr>
134
+ </table>
135
+ </div>
136
+ </div>
137
+
138
+
139
+
140
+ <!-- if method_list -->
141
+ <div id="methods">
142
+ <h3 class="section-bar">Public Class methods</h3>
143
+
144
+ <div id="method-M000001" class="method-detail">
145
+ <a name="M000001"></a>
146
+
147
+ <div class="method-heading">
148
+ <a href="ContentMapping.src/M000001.html" target="Code" class="method-signature"
149
+ onclick="popupCode('ContentMapping.src/M000001.html');return false;">
150
+ <span class="method-name">new</span><span class="method-args">()</span>
151
+ </a>
152
+ </div>
153
+
154
+ <div class="method-description">
155
+ </div>
156
+ </div>
157
+
158
+ <h3 class="section-bar">Public Instance methods</h3>
159
+
160
+ <div id="method-M000003" class="method-detail">
161
+ <a name="M000003"></a>
162
+
163
+ <div class="method-heading">
164
+ <a href="ContentMapping.src/M000003.html" target="Code" class="method-signature"
165
+ onclick="popupCode('ContentMapping.src/M000003.html');return false;">
166
+ <span class="method-name">content_at</span><span class="method-args">(content_xpath)</span>
167
+ </a>
168
+ </div>
169
+
170
+ <div class="method-description">
171
+ </div>
172
+ </div>
173
+
174
+ <div id="method-M000004" class="method-detail">
175
+ <a name="M000004"></a>
176
+
177
+ <div class="method-heading">
178
+ <a href="ContentMapping.src/M000004.html" target="Code" class="method-signature"
179
+ onclick="popupCode('ContentMapping.src/M000004.html');return false;">
180
+ <span class="method-name">iconv</span><span class="method-args">(args)</span>
181
+ </a>
182
+ </div>
183
+
184
+ <div class="method-description">
185
+ </div>
186
+ </div>
187
+
188
+ <div id="method-M000005" class="method-detail">
189
+ <a name="M000005"></a>
190
+
191
+ <div class="method-heading">
192
+ <a href="ContentMapping.src/M000005.html" target="Code" class="method-signature"
193
+ onclick="popupCode('ContentMapping.src/M000005.html');return false;">
194
+ <span class="method-name">matches_url?</span><span class="method-args">(url)</span>
195
+ </a>
196
+ </div>
197
+
198
+ <div class="method-description">
199
+ </div>
200
+ </div>
201
+
202
+ <div id="method-M000006" class="method-detail">
203
+ <a name="M000006"></a>
204
+
205
+ <div class="method-heading">
206
+ <a href="ContentMapping.src/M000006.html" target="Code" class="method-signature"
207
+ onclick="popupCode('ContentMapping.src/M000006.html');return false;">
208
+ <span class="method-name">scrap_content</span><span class="method-args">(doc, content_scrapper = nil)</span>
209
+ </a>
210
+ </div>
211
+
212
+ <div class="method-description">
213
+ </div>
214
+ </div>
215
+
216
+ <div id="method-M000002" class="method-detail">
217
+ <a name="M000002"></a>
218
+
219
+ <div class="method-heading">
220
+ <a href="ContentMapping.src/M000002.html" target="Code" class="method-signature"
221
+ onclick="popupCode('ContentMapping.src/M000002.html');return false;">
222
+ <span class="method-name">url_pattern</span><span class="method-args">(pattern)</span>
223
+ </a>
224
+ </div>
225
+
226
+ <div class="method-description">
227
+ </div>
228
+ </div>
229
+
230
+
231
+ </div>
232
+
233
+
234
+ </div>
235
+
236
+
237
+ <div id="validator-badges">
238
+ <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
239
+ </div>
240
+
241
+ </body>
242
+ </html>
data/doc/classes/ContentMapping.src/M000001.html ADDED
@@ -0,0 +1,18 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html>
7
+ <head>
8
+ <title>new (ContentMapping)</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
11
+ </head>
12
+ <body class="standalone-code">
13
+ <pre><span class="ruby-comment cmt"># File lib/content_scrapper/content_mapping.rb, line 7</span>
14
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>
15
+ <span class="ruby-ivar">@content_xpaths_list</span> = []
16
+ <span class="ruby-keyword kw">end</span></pre>
17
+ </body>
18
+ </html>
data/doc/classes/ContentMapping.src/M000002.html ADDED
@@ -0,0 +1,18 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html>
7
+ <head>
8
+ <title>url_pattern (ContentMapping)</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
11
+ </head>
12
+ <body class="standalone-code">
13
+ <pre><span class="ruby-comment cmt"># File lib/content_scrapper/content_mapping.rb, line 11</span>
14
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">url_pattern</span>(<span class="ruby-identifier">pattern</span>)
15
+ <span class="ruby-ivar">@url_pattern_regexp</span> = <span class="ruby-identifier">pattern</span>
16
+ <span class="ruby-keyword kw">end</span></pre>
17
+ </body>
18
+ </html>
data/doc/classes/ContentMapping.src/M000003.html ADDED
@@ -0,0 +1,18 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html>
7
+ <head>
8
+ <title>content_at (ContentMapping)</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
11
+ </head>
12
+ <body class="standalone-code">
13
+ <pre><span class="ruby-comment cmt"># File lib/content_scrapper/content_mapping.rb, line 15</span>
14
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">content_at</span>(<span class="ruby-identifier">content_xpath</span>)
15
+ <span class="ruby-ivar">@content_xpaths_list</span> <span class="ruby-operator">&lt;&lt;</span> <span class="ruby-identifier">content_xpath</span>
16
+ <span class="ruby-keyword kw">end</span></pre>
17
+ </body>
18
+ </html>
data/doc/classes/ContentMapping.src/M000004.html ADDED
@@ -0,0 +1,19 @@
1
+ <?xml version="1.0" encoding="iso-8859-1"?>
2
+ <!DOCTYPE html
3
+ PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
+
6
+ <html>
7
+ <head>
8
+ <title>iconv (ContentMapping)</title>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
+ <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
11
+ </head>
12
+ <body class="standalone-code">
13
+ <pre><span class="ruby-comment cmt"># File lib/content_scrapper/content_mapping.rb, line 19</span>
14
+ <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">iconv</span>(<span class="ruby-identifier">args</span>)
15
+ <span class="ruby-ivar">@iconv_from</span> = <span class="ruby-identifier">args</span>[<span class="ruby-identifier">:from</span>]
16
+ <span class="ruby-ivar">@iconv_to</span> = <span class="ruby-identifier">args</span>[<span class="ruby-identifier">:to</span>]
17
+ <span class="ruby-keyword kw">end</span></pre>
18
+ </body>
19
+ </html>