artext 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: dab606c0c96c6939da80ddf9e5ae61ecb905d90f
4
- data.tar.gz: 7df963e4a642b499c776b844e2da9901e58dfefb
3
+ metadata.gz: 379fd2eab84219f4a82393817354930f7e6cbacd
4
+ data.tar.gz: cee44b5160c701f51c1836d6cd0a27d655ecf4bb
5
5
  SHA512:
6
- metadata.gz: 13819faa7e432065a235b69ad2bcd326015716dff27e1bea9b8b7929e3c4b2d0a0ca77e2b16ee1f276714fe07bf066f14b563ad9c1d5e762859c414b7e7991dc
7
- data.tar.gz: 8e8c98d7d52e137272181661a30921ef9d49af6cfd419cdbc9fb78c1b8f484bea879c93a2fac970c3a59d76bc1643df3a6f308b9436a480b05fa313c82605b1f
6
+ metadata.gz: e505ce5c26c90ecd95ffee18656626954276fc5333e607c158b650f2ded704a7b69a2181904555ae106f25e252488ca878b3aa58bccbd7fda21145c013290ac2
7
+ data.tar.gz: a6ffbd0ebe58681e9ecb9e0554e3aa0fc9ff446fe082ce8ab463362e16a83b1d91ddfdc30e6ecb08adf1a1b1e679ee363d2fcebf443b1a27b58fc260aa09d02b
@@ -48,7 +48,7 @@ DEPENDENCIES
48
48
  artext!
49
49
  bundler (~> 1.6)
50
50
  rake (~> 10.0)
51
- rspec
51
+ rspec (~> 3.3)
52
52
 
53
53
  BUNDLED WITH
54
54
  1.10.6
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Artext
2
2
 
3
- TODO: Write a gem description
3
+ Artext is a gem for extracting articles from web pages. It strips out advertisements and other extraneous content, leaving only the core content of the article. It is useful for applications that display an article view.
4
4
 
5
5
  ## Installation
6
6
 
@@ -20,10 +20,43 @@ Or install it yourself as:
20
20
 
21
21
  ## Usage
22
22
 
23
- TODO: Write usage instructions here
23
+ ```ruby
24
+ require 'artext'
25
+
26
+ url = "http://techcrunch.com/2015/07/07/meet-lynx-an-app-for-sharing-links-with-friends/"
27
+
28
+ Artext.extract(url)
29
+ ```
30
+
31
+ ## Response
32
+
33
+ ```ruby
34
+ response =
35
+ {
36
+ :url => "http://techcrunch.com/2015/07/07/meet-lynx-an-app-for-sharing-links-with-friends/",
37
+ :data => [{
38
+ :image => "OG IMAGE",
39
+ :title => "OG TITLE",
40
+ :tags => ["KEYWORDS AND TAGS"],
41
+ :type => "OG TYPE",
42
+ :favicon => "FAVICON URL",
43
+ :theme => "DOMINANT COLOR IN THE FAVICON",
44
+ }],
45
+ :article => [{
46
+ :body => "SANITIZED HTML OF THE CORE CONTENT",
47
+ :text => "TEXT OF THE CORE CONTENT",
48
+ :images => ["ARRAY OF IMAGE URLS IN THE SANITIZED HTML"],
49
+ :date => "PUBLISHING DATE OF THE ARTICLE",
50
+ :author => ["PUBLISHING AUTHOR(s)"],
51
+ :score => SCORE BETWEEN 0 AND 1 RELATING TO HOW SUCCESSFULLY THE CONTENT WAS EXTRACTED,
52
+ }]
53
+ }
54
+ ```
24
55
 
25
56
  ## Contributing
26
57
 
58
+ Thank you, @amitsaxena, for your input.
59
+
27
60
  1. Fork it ( https://github.com/[my-github-username]/artext/fork )
28
61
  2. Create your feature branch (`git checkout -b my-new-feature`)
29
62
  3. Commit your changes (`git commit -am 'Add some feature'`)
@@ -121,12 +121,11 @@ module Artext
121
121
  if (article.count > 1)
122
122
  article = get_correct_article(article)
123
123
  score = 0.8
124
- end
125
- if (!is_blank?(article))
124
+ end
125
+ if (is_blank?(article))
126
126
  article = find_article(doc)
127
127
  score = 0.6
128
128
  end
129
- score = score - 0.5 if (type != "article")
130
129
  if (is_blank?(article))
131
130
  # image url
132
131
  begin
@@ -139,6 +138,7 @@ module Artext
139
138
  else
140
139
  article = remove_unwanted_items_from(article)
141
140
  article, score = find_relevant(article, score)
141
+ score = score - 0.5 if (type != "article")
142
142
  if (score > 0.9)
143
143
  html, imgs = iteratively_clean(article, "", [], score)
144
144
  else
@@ -163,7 +163,7 @@ module Artext
163
163
  rel = ps
164
164
  end
165
165
  end
166
- if ((last_p > 5 && last_p == max_p) || max_p - total_p < 2)
166
+ if ((last_p > 5 && last_p == max_p) || total_p - max_p < 2)
167
167
  score = 0.95 if (score < 1)
168
168
  break
169
169
  end
@@ -224,7 +224,7 @@ module Artext
224
224
  end
225
225
  elsif (element.name == "h1" || element.name == "h2" || element.name == "h3" || element.name == "h4")
226
226
  tv = "<h2>#{element.text.split.join(" ")}</h2>" if (!is_blank?(element.text.split.join(" ")))
227
- elsif (element.name == "p" || element.name == "div")
227
+ elsif (element.name == "p")
228
228
  p_elem, ti = extractp(element, score)
229
229
  tv = "<p>#{p_elem}</p>" if (!is_blank?(p_elem))
230
230
  images = images + ti if (!is_blank?(ti))
@@ -243,6 +243,16 @@ module Artext
243
243
  elsif (element.name == "ol" || element.name == "ul")
244
244
  tv, ti = listhandle(element)
245
245
  images = images + ti
246
+ elsif (element.name == "div" || element.name == "span")
247
+ html = ""
248
+ imgs = []
249
+ element.children.each do |elem|
250
+ tv, ti = get_element_html(elem, [], score)
251
+ html = html + tv if (!is_blank?(tv))
252
+ imgs = imgs + ti if (!is_blank?(ti))
253
+ end
254
+ tv = html
255
+ images = imgs
246
256
  end
247
257
  return tv, images
248
258
  end
@@ -356,7 +366,7 @@ module Artext
356
366
  end
357
367
 
358
368
  def self.remove_unwanted_items_from(article)
359
- unwanted_elements = ["//script", "//comment()", "//aside", ".aside", "iframe", "//noscript", "//form"]
369
+ unwanted_elements = ["//script", "//comment()", "//aside", ".aside", "iframe", "//noscript", "//form", "//header", "//footer"]
360
370
  unwanted_elements.each do |elem|
361
371
  article.search("#{elem}").remove
362
372
  end
@@ -1,3 +1,3 @@
1
1
  module Artext
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: artext
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Anindya Mondal
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-09-12 00:00:00.000000000 Z
11
+ date: 2015-09-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler