kcna 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6e8215fbfaf2abdef2e8fd42f3e9b11cce17b762
4
- data.tar.gz: b8214218bb1104dba3529e3c20c610aa1aa72804
3
+ metadata.gz: 66b4f906f215078226818809408690061af3561b
4
+ data.tar.gz: 91857a54a93a46e71648983199f8ce5a49afee4f
5
5
  SHA512:
6
- metadata.gz: 8e1d84e5676bacc5a107cc4ad29680eced2c56a1cf8120bd4a3af8c6783d216a9942d9780c3262c4fdb5c1c11b535af97b85c3d72dd66d91d4a81bcafc2127dc
7
- data.tar.gz: 12a957382d0fcfa5d207d5788fd5af82ab974f34d9a16eacf233b556722fb34e88cb456628e5104ec3fd9332bf010423d6120a3833432478a3d3e4f2b821f179
6
+ metadata.gz: 587f36f1e416aa5374b0d607fecca5a8ffde3b79a9a7551cd1a45e48fe9760b809111bcd76a597342d046890bad02f318462ed193b19eca93342b4b7a9f8ad13
7
+ data.tar.gz: f580279a254c66920f46cf0fd67d034598acf47a4a957979aa9c6d5654da2389b515c77e418163abf5824c3d19dded16ad12d449094465374bd4b91a71529437
data/README.md CHANGED
@@ -1,2 +1,32 @@
1
1
  # kcna.rb
2
+
2
3
  A Ruby gem for kcna.kp(KCNA, Korean Central News Agency)
4
+
5
+ ## Installation
6
+
7
+ Add this line to your Gemfile and execute `bundle`. (recommended way)
8
+
9
+ ```ruby
10
+ gem 'kcna'
11
+ ```
12
+
13
+ You can also install it by `gem install kcna`.
14
+
15
+ ## Usage
16
+
17
+ ```ruby
18
+ require "kcna"
19
+
20
+ kcna = KCNA.new
21
+
22
+ kcna.get_article_list.each do |article|
23
+ content = kcna.get_article(article.id).content
24
+ File.write("/path/to/directory/#{article.id}.txt", content)
25
+ end
26
+ ```
27
+
28
+ See also [RubyDoc](http://www.rubydoc.info/github/hinamiyagk/kcna.rb/master).
29
+
30
+ ## License
31
+
32
+ MIT
@@ -3,7 +3,9 @@ require "kcna/article"
3
3
  require "httpclient"
4
4
  require "date"
5
5
  require "rexml/document"
6
+ require "oga"
6
7
 
8
+ # KCNA provides several methods for accessing KCNA resources.
7
9
  class KCNA
8
10
  KO = "kor"
9
11
  EN = "eng"
@@ -16,19 +18,22 @@ class KCNA
16
18
  @client = HTTPClient.new
17
19
  end
18
20
 
21
+ private def strip_html(string)
22
+ Oga.parse_html(string).children.map(&:text).join
23
+ end
24
+
25
+ # Processes raw article content.
26
+ # This method strips HTML tags and trailing unnecessary strings.
19
27
  def normalize_text(content)
20
- great_leader_pattern = /<nobr><strong><font.*>(.*)<\/font><\/strong><\/nobr>/
21
- patterns = ["\n", "<br>", "&nbsp;", great_leader_pattern]
22
- content.gsub(Regexp.union(patterns)) do |match|
28
+ replaced_content = content.gsub(/\n|<br>|&nbsp;/) do |match|
23
29
  case match
24
30
  when "\n", "&nbsp;"
25
31
  ""
26
32
  when "<br>"
27
33
  "\n"
28
- when great_leader_pattern
29
- $1
30
34
  end
31
35
  end.sub(/(---|‐‐‐)$/, "")
36
+ strip_html(replaced_content)
32
37
  end
33
38
 
34
39
  private def post(path, body, max_redirect = 3)
@@ -45,11 +50,14 @@ class KCNA
45
50
  end
46
51
  end
47
52
 
53
+ # Sets the response language by sending request to kcna.kp.
54
+ # @param lang [String] the language code. One of +KCNA::KO+, +KCNA::EN+, +KCNA::ZH+, +KCNA::RU+, +KCNA::ES+, and +KCNA::JA+.
48
55
  def set_language(lang)
49
56
  data = {
50
57
  article_code: "", article_type_list: "", news_type_code: "", show_what: "", mediaCode: "",
51
58
  lang: lang
52
59
  }
60
+ # Cookies are handled automatically by httpclient
53
61
  post("/kcna.user.home.retrieveHomeInfoList.kcmsf", data)
54
62
  end
55
63
 
@@ -58,6 +66,11 @@ class KCNA
58
66
  post("/kcna.user.article.retrieveArticleInfoFromArticleCode.kcmsf", data).body
59
67
  end
60
68
 
69
+ # Fetches the article by article ID.
70
+ # The content of the article is already processed by {#normalize_text},
71
+ # so you don't have to do it by yourself.
72
+ # @param article_id [String] article ID.
73
+ # @return [KCNA::Article] the article data
61
74
  def get_article(article_id)
62
75
  doc = REXML::Document.new(fetch_article(article_id))
63
76
  container = REXML::XPath.first(doc, "//NData")
@@ -74,20 +87,31 @@ class KCNA
74
87
  music_count = REXML::XPath.first(doc, "//fMusicCnt").text.to_i
75
88
 
76
89
  Article.new(
77
- id: article_id, content: content,
90
+ article_id, content: content,
78
91
  date: date,
79
92
  main_title: main_title, sub_title: sub_title, display_title: display_title,
80
93
  movie_count: movie_count, photo_count: photo_count, music_count: music_count
81
94
  )
82
95
  end
83
96
 
84
- private def fetch_article_list(start, news_type, from_date, to_date)
85
- data = { page_start: start, kwDispTitle: "", keyword: "", newsTypeCode: news_type, articleTypeList: "", photoCount: 0, movieCount: 0, kwContent: "", fromDate: from_date, toDate: to_date }
97
+ private def fetch_article_list(start, news_type, from_date, to_date, title_keyword, content_keyword)
98
+ data = { page_start: start, kwDispTitle: title_keyword, keyword: "", newsTypeCode: news_type, articleTypeList: "", photoCount: 0, movieCount: 0, kwContent: content_keyword, fromDate: from_date, toDate: to_date }
86
99
  post("/kcna.user.article.retrieveArticleListForPage.kcmsf", data).body
87
100
  end
88
101
 
89
- def get_article_list(start = 0, news_type: "", from_date: "", to_date: "")
90
- doc = REXML::Document.new(fetch_article_list(start, news_type, from_date, to_date))
102
+ # Fetches a list of articles.
103
+ # @param start [Integer] Index number for pagination.
104
+ # @param news_type [String] news type.
105
+ # @param from_date [Date, String] This method searches for articles after this date.
106
+ # @param to_date [Date, String] This method searches for articles before this date.
107
+ # @param title_keyword [String] search keyword for title.
108
+ # @param content_keyword [String] keyword for full-text search of the articles.
109
+ # @return [Array<KCNA::Article>] article list
110
+ def get_article_list(start = 0, news_type: "", from_date: "", to_date: "", title_keyword: "", content_keyword: "")
111
+ from_date = from_date.to_s unless from_date.kind_of?(String)
112
+ to_date = to_date.to_s unless to_date.kind_of?(String)
113
+
114
+ doc = REXML::Document.new(fetch_article_list(start, news_type, from_date, to_date, title_keyword, content_keyword))
91
115
  article_ids = REXML::XPath.match(doc, "//articleCode").map(&:text)
92
116
  disp_titles = REXML::XPath.match(doc, "//dispTitle").map { |node| normalize_text(node.text) }
93
117
  main_titles = REXML::XPath.match(doc, "//mainTitle").map { |node| normalize_text(node.text) }
@@ -103,7 +127,7 @@ class KCNA
103
127
  ).map do |id, disp, main, sub, date, movie, music, photo|
104
128
  date = "2015-04-02" if id == "AR0060168"
105
129
  Article.new(
106
- id: id, date: Date.parse(date),
130
+ id, date: Date.parse(date),
107
131
  display_title: disp, main_title: main, sub_title: sub,
108
132
  movie_count: movie, music_count: music, photo_count: photo
109
133
  )
@@ -1,20 +1,49 @@
1
+ # Represents an article.
1
2
  class KCNA::Article
3
+ # @return [String] ID of the article
2
4
  attr_reader :id
3
5
 
4
6
  def self.attr_reader_hash(key, default = nil)
5
7
  define_method(key, -> { @attrs[key].nil? ? default : @attrs[key] })
6
8
  end
9
+ private_class_method :attr_reader_hash
7
10
 
11
+ # @!attribute [r]
12
+ # @return [Date] date of the article
8
13
  attr_reader_hash :date
14
+ # @!attribute [r]
15
+ # @return [String] content of the article
9
16
  attr_reader_hash :content
17
+ # @!attribute [r]
18
+ # @return [String] main title of the article
10
19
  attr_reader_hash :main_title
20
+ # @!attribute [r]
21
+ # @return [String] subtitle of the article
11
22
  attr_reader_hash :sub_title, ""
23
+ # @!attribute [r]
24
+ # @return [String]
12
25
  attr_reader_hash :display_title
26
+ # @!attribute [r]
27
+ # @return [Integer] the number of movies related to the article
13
28
  attr_reader_hash :movie_count, 0
29
+ # @!attribute [r]
30
+ # @return [Integer] the number of photos related to the article
14
31
  attr_reader_hash :photo_count, 0
32
+ # @!attribute [r]
33
+ # @return [Integer] the number of music tracks related to the article
15
34
  attr_reader_hash :music_count, 0
16
35
 
17
- def initialize(id:, **attrs)
36
+ # @param [Hash] attrs attributes of article
37
+ # @option attrs [Date] :date date of the article
38
+ # @option attrs [String] :content content of the article
39
+ # @option attrs [String] :main_title main title of the article
40
+ # @option attrs [String] :sub_title subtitle of the article
41
+ # @option attrs [String] :display_title
42
+ # @option attrs [Integer] :movie_count the number of movies related to the article
43
+ # @option attrs [Integer] :photo_count the number of photos related to the article
44
+ # @option attrs [Integer] :music_count the number of music tracks related to the article
45
+ def initialize(id, **attrs)
46
+ raise "id is not a string" unless id.kind_of?(String)
18
47
  @id = id
19
48
  @attrs = attrs
20
49
  end
@@ -1,3 +1,3 @@
1
1
  class KCNA
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kcna
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Hinata
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-07-10 00:00:00.000000000 Z
11
+ date: 2017-07-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: httpclient
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '2.8'
27
+ - !ruby/object:Gem::Dependency
28
+ name: oga
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '2.10'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '2.10'
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: bundler
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -80,6 +94,20 @@ dependencies:
80
94
  - - "~>"
81
95
  - !ruby/object:Gem::Version
82
96
  version: '0.10'
97
+ - !ruby/object:Gem::Dependency
98
+ name: yard
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
83
111
  description:
84
112
  email:
85
113
  - syobon.hinata.public@gmail.com