artext 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +35 -2
- data/lib/artext.rb +16 -6
- data/lib/artext/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 379fd2eab84219f4a82393817354930f7e6cbacd
|
4
|
+
data.tar.gz: cee44b5160c701f51c1836d6cd0a27d655ecf4bb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e505ce5c26c90ecd95ffee18656626954276fc5333e607c158b650f2ded704a7b69a2181904555ae106f25e252488ca878b3aa58bccbd7fda21145c013290ac2
|
7
|
+
data.tar.gz: a6ffbd0ebe58681e9ecb9e0554e3aa0fc9ff446fe082ce8ab463362e16a83b1d91ddfdc30e6ecb08adf1a1b1e679ee363d2fcebf443b1a27b58fc260aa09d02b
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# Artext
|
2
2
|
|
3
|
-
|
3
|
+
Artext is a gem that extracts articles from web pages. It removes advertisements and other extraneous content, leaving only the core content of the article. It can be helpful for applications that provide an article view.
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
@@ -20,10 +20,43 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
23
|
-
|
23
|
+
```ruby
|
24
|
+
require 'artext'
|
25
|
+
|
26
|
+
url = "http://techcrunch.com/2015/07/07/meet-lynx-an-app-for-sharing-links-with-friends/"
|
27
|
+
|
28
|
+
Artext.extract(url)
|
29
|
+
```
|
30
|
+
|
31
|
+
## Response
|
32
|
+
|
33
|
+
```ruby
|
34
|
+
response =
|
35
|
+
{
|
36
|
+
:url => "http://techcrunch.com/2015/07/07/meet-lynx-an-app-for-sharing-links-with-friends/"
|
37
|
+
:data => [{
|
38
|
+
:image => "OG IMAGE",
|
39
|
+
:title => "OG TITLE",
|
40
|
+
:tags => ["KEYWORDS AND TAGS"],
|
41
|
+
:type => "OG TYPE",
|
42
|
+
:favicon => "FAVICON URL",
|
43
|
+
:theme => "DOMINANT COLOR IN THE FAVICON",
|
44
|
+
}]
|
45
|
+
:article => [{
|
46
|
+
:body => "SANITIZED HTML OF THE CORE CONTENT",
|
47
|
+
:text => "TEXT OF THE CORE CONTENT",
|
48
|
+
:images => ["ARRAY OF IMAGE URLS IN THE SANITIZED HTML"],
|
49
|
+
:date => "PUBLISHING DATE OF THE ARTICLE",
|
50
|
+
:author => ["PUBLISHING AUTHOR(s)"],
|
51
|
+
:score => SCORE BETWEEN 0 AND 1 RELATING TO HOW SUCCESSFULLY THE CONTENT WAS EXTRACTED,
|
52
|
+
}]
|
53
|
+
}
|
54
|
+
```
|
24
55
|
|
25
56
|
## Contributing
|
26
57
|
|
58
|
+
Thank you, @amitsaxena, for your input.
|
59
|
+
|
27
60
|
1. Fork it ( https://github.com/[my-github-username]/artext/fork )
|
28
61
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
29
62
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
data/lib/artext.rb
CHANGED
@@ -121,12 +121,11 @@ module Artext
|
|
121
121
|
if (article.count > 1)
|
122
122
|
article = get_correct_article(article)
|
123
123
|
score = 0.8
|
124
|
-
end
|
125
|
-
if (
|
124
|
+
end
|
125
|
+
if (is_blank?(article))
|
126
126
|
article = find_article(doc)
|
127
127
|
score = 0.6
|
128
128
|
end
|
129
|
-
score = score - 0.5 if (type != "article")
|
130
129
|
if (is_blank?(article))
|
131
130
|
# image url
|
132
131
|
begin
|
@@ -139,6 +138,7 @@ module Artext
|
|
139
138
|
else
|
140
139
|
article = remove_unwanted_items_from(article)
|
141
140
|
article, score = find_relevant(article, score)
|
141
|
+
score = score - 0.5 if (type != "article")
|
142
142
|
if (score > 0.9)
|
143
143
|
html, imgs = iteratively_clean(article, "", [], score)
|
144
144
|
else
|
@@ -163,7 +163,7 @@ module Artext
|
|
163
163
|
rel = ps
|
164
164
|
end
|
165
165
|
end
|
166
|
-
if ((last_p > 5 && last_p == max_p) ||
|
166
|
+
if ((last_p > 5 && last_p == max_p) || total_p - max_p < 2)
|
167
167
|
score = 0.95 if (score < 1)
|
168
168
|
break
|
169
169
|
end
|
@@ -224,7 +224,7 @@ module Artext
|
|
224
224
|
end
|
225
225
|
elsif (element.name == "h1" || element.name == "h2" || element.name == "h3" || element.name == "h4")
|
226
226
|
tv = "<h2>#{element.text.split.join(" ")}</h2>" if (!is_blank?(element.text.split.join(" ")))
|
227
|
-
elsif (element.name == "p"
|
227
|
+
elsif (element.name == "p")
|
228
228
|
p_elem, ti = extractp(element, score)
|
229
229
|
tv = "<p>#{p_elem}</p>" if (!is_blank?(p_elem))
|
230
230
|
images = images + ti if (!is_blank?(ti))
|
@@ -243,6 +243,16 @@ module Artext
|
|
243
243
|
elsif (element.name == "ol" || element.name == "ul")
|
244
244
|
tv, ti = listhandle(element)
|
245
245
|
images = images + ti
|
246
|
+
elsif (element.name == "div" || element.name == "span")
|
247
|
+
html = ""
|
248
|
+
imgs = []
|
249
|
+
element.children.each do |elem|
|
250
|
+
tv, ti = get_element_html(elem, [], score)
|
251
|
+
html = html + tv if (!is_blank?(tv))
|
252
|
+
imgs = imgs + ti if (!is_blank?(ti))
|
253
|
+
end
|
254
|
+
tv = html
|
255
|
+
images = imgs
|
246
256
|
end
|
247
257
|
return tv, images
|
248
258
|
end
|
@@ -356,7 +366,7 @@ module Artext
|
|
356
366
|
end
|
357
367
|
|
358
368
|
def self.remove_unwanted_items_from(article)
|
359
|
-
unwanted_elements = ["//script", "//comment()", "//aside", ".aside", "iframe", "//noscript", "//form"]
|
369
|
+
unwanted_elements = ["//script", "//comment()", "//aside", ".aside", "iframe", "//noscript", "//form", "//header", "//footer"]
|
360
370
|
unwanted_elements.each do |elem|
|
361
371
|
article.search("#{elem}").remove
|
362
372
|
end
|
data/lib/artext/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: artext
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Anindya Mondal
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-09-
|
11
|
+
date: 2015-09-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|