kcna 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6e8215fbfaf2abdef2e8fd42f3e9b11cce17b762
4
- data.tar.gz: b8214218bb1104dba3529e3c20c610aa1aa72804
3
+ metadata.gz: 66b4f906f215078226818809408690061af3561b
4
+ data.tar.gz: 91857a54a93a46e71648983199f8ce5a49afee4f
5
5
  SHA512:
6
- metadata.gz: 8e1d84e5676bacc5a107cc4ad29680eced2c56a1cf8120bd4a3af8c6783d216a9942d9780c3262c4fdb5c1c11b535af97b85c3d72dd66d91d4a81bcafc2127dc
7
- data.tar.gz: 12a957382d0fcfa5d207d5788fd5af82ab974f34d9a16eacf233b556722fb34e88cb456628e5104ec3fd9332bf010423d6120a3833432478a3d3e4f2b821f179
6
+ metadata.gz: 587f36f1e416aa5374b0d607fecca5a8ffde3b79a9a7551cd1a45e48fe9760b809111bcd76a597342d046890bad02f318462ed193b19eca93342b4b7a9f8ad13
7
+ data.tar.gz: f580279a254c66920f46cf0fd67d034598acf47a4a957979aa9c6d5654da2389b515c77e418163abf5824c3d19dded16ad12d449094465374bd4b91a71529437
data/README.md CHANGED
@@ -1,2 +1,32 @@
1
1
  # kcna.rb
2
+
2
3
  A Ruby gem for kcna.kp(KCNA, Korean Central News Agency)
4
+
5
+ ## Installation
6
+
7
+ Add this line to your Gemfile and execute `bundle`. (recommended way)
8
+
9
+ ```ruby
10
+ gem 'kcna'
11
+ ```
12
+
13
+ You can also install it by `gem install kcna`.
14
+
15
+ ## Usage
16
+
17
+ ```ruby
18
+ require "kcna"
19
+
20
+ kcna = KCNA.new
21
+
22
+ kcna.get_article_list.each do |article|
23
+ content = kcna.get_article(article.id).content
24
+ File.write("/path/to/directory/#{article.id}.txt", content)
25
+ end
26
+ ```
27
+
28
+ See also [RubyDoc](http://www.rubydoc.info/github/hinamiyagk/kcna.rb/master).
29
+
30
+ ## License
31
+
32
+ MIT
@@ -3,7 +3,9 @@ require "kcna/article"
3
3
  require "httpclient"
4
4
  require "date"
5
5
  require "rexml/document"
6
+ require "oga"
6
7
 
8
+ # KCNA provides several methods for accessing KCNA resources.
7
9
  class KCNA
8
10
  KO = "kor"
9
11
  EN = "eng"
@@ -16,19 +18,22 @@ class KCNA
16
18
  @client = HTTPClient.new
17
19
  end
18
20
 
21
+ private def strip_html(string)
22
+ Oga.parse_html(string).children.map(&:text).join
23
+ end
24
+
25
+ # Processes raw article content.
26
+ # This method strips HTML tags and trailing unnecessary strings.
19
27
  def normalize_text(content)
20
- great_leader_pattern = /<nobr><strong><font.*>(.*)<\/font><\/strong><\/nobr>/
21
- patterns = ["\n", "<br>", "&nbsp;", great_leader_pattern]
22
- content.gsub(Regexp.union(patterns)) do |match|
28
+ replaced_content = content.gsub(/\n|<br>|&nbsp;/) do |match|
23
29
  case match
24
30
  when "\n", "&nbsp;"
25
31
  ""
26
32
  when "<br>"
27
33
  "\n"
28
- when great_leader_pattern
29
- $1
30
34
  end
31
35
  end.sub(/(---|‐‐‐)$/, "")
36
+ strip_html(replaced_content)
32
37
  end
33
38
 
34
39
  private def post(path, body, max_redirect = 3)
@@ -45,11 +50,14 @@ class KCNA
45
50
  end
46
51
  end
47
52
 
53
+ # Sets the response language by sending a request to kcna.kp.
54
+ # @param lang [String] the language code. One of +KCNA::KO+, +KCNA::EN+, +KCNA::ZH+, +KCNA::RU+, +KCNA::ES+, and +KCNA::JA+.
48
55
  def set_language(lang)
49
56
  data = {
50
57
  article_code: "", article_type_list: "", news_type_code: "", show_what: "", mediaCode: "",
51
58
  lang: lang
52
59
  }
60
+ # Cookies are handled automatically by httpclient
53
61
  post("/kcna.user.home.retrieveHomeInfoList.kcmsf", data)
54
62
  end
55
63
 
@@ -58,6 +66,11 @@ class KCNA
58
66
  post("/kcna.user.article.retrieveArticleInfoFromArticleCode.kcmsf", data).body
59
67
  end
60
68
 
69
+ # Fetches the article by article ID.
70
+ # The content of the article is already processed by {#normalize_text},
71
+ # so you don't have to do it yourself.
72
+ # @param article_id [String] article ID.
73
+ # @return [KCNA::Article] the article data
61
74
  def get_article(article_id)
62
75
  doc = REXML::Document.new(fetch_article(article_id))
63
76
  container = REXML::XPath.first(doc, "//NData")
@@ -74,20 +87,31 @@ class KCNA
74
87
  music_count = REXML::XPath.first(doc, "//fMusicCnt").text.to_i
75
88
 
76
89
  Article.new(
77
- id: article_id, content: content,
90
+ article_id, content: content,
78
91
  date: date,
79
92
  main_title: main_title, sub_title: sub_title, display_title: display_title,
80
93
  movie_count: movie_count, photo_count: photo_count, music_count: music_count
81
94
  )
82
95
  end
83
96
 
84
- private def fetch_article_list(start, news_type, from_date, to_date)
85
- data = { page_start: start, kwDispTitle: "", keyword: "", newsTypeCode: news_type, articleTypeList: "", photoCount: 0, movieCount: 0, kwContent: "", fromDate: from_date, toDate: to_date }
97
+ private def fetch_article_list(start, news_type, from_date, to_date, title_keyword, content_keyword)
98
+ data = { page_start: start, kwDispTitle: title_keyword, keyword: "", newsTypeCode: news_type, articleTypeList: "", photoCount: 0, movieCount: 0, kwContent: content_keyword, fromDate: from_date, toDate: to_date }
86
99
  post("/kcna.user.article.retrieveArticleListForPage.kcmsf", data).body
87
100
  end
88
101
 
89
- def get_article_list(start = 0, news_type: "", from_date: "", to_date: "")
90
- doc = REXML::Document.new(fetch_article_list(start, news_type, from_date, to_date))
102
+ # Fetches a list of articles.
103
+ # @param start [Integer] Index number for pagination.
104
+ # @param news_type [String] news type.
105
+ # @param from_date [Date, String] This method searches for articles after this date.
106
+ # @param to_date [Date, String] This method searches for articles before this date.
107
+ # @param title_keyword [String] search keyword for title.
108
+ # @param content_keyword [String] keyword for full-text search of the articles.
109
+ # @return [Array<KCNA::Article>] article list
110
+ def get_article_list(start = 0, news_type: "", from_date: "", to_date: "", title_keyword: "", content_keyword: "")
111
+ from_date = from_date.to_s unless from_date.kind_of?(String)
112
+ to_date = to_date.to_s unless to_date.kind_of?(String)
113
+
114
+ doc = REXML::Document.new(fetch_article_list(start, news_type, from_date, to_date, title_keyword, content_keyword))
91
115
  article_ids = REXML::XPath.match(doc, "//articleCode").map(&:text)
92
116
  disp_titles = REXML::XPath.match(doc, "//dispTitle").map { |node| normalize_text(node.text) }
93
117
  main_titles = REXML::XPath.match(doc, "//mainTitle").map { |node| normalize_text(node.text) }
@@ -103,7 +127,7 @@ class KCNA
103
127
  ).map do |id, disp, main, sub, date, movie, music, photo|
104
128
  date = "2015-04-02" if id == "AR0060168"
105
129
  Article.new(
106
- id: id, date: Date.parse(date),
130
+ id, date: Date.parse(date),
107
131
  display_title: disp, main_title: main, sub_title: sub,
108
132
  movie_count: movie, music_count: music, photo_count: photo
109
133
  )
@@ -1,20 +1,49 @@
1
+ # Represents an article.
1
2
  class KCNA::Article
3
+ # @return [String] ID of the article
2
4
  attr_reader :id
3
5
 
4
6
  def self.attr_reader_hash(key, default = nil)
5
7
  define_method(key, -> { @attrs[key].nil? ? default : @attrs[key] })
6
8
  end
9
+ private_class_method :attr_reader_hash
7
10
 
11
+ # @!attribute [r]
12
+ # @return [Date] date of the article
8
13
  attr_reader_hash :date
14
+ # @!attribute [r]
15
+ # @return [String] content of the article
9
16
  attr_reader_hash :content
17
+ # @!attribute [r]
18
+ # @return [String] main title of the article
10
19
  attr_reader_hash :main_title
20
+ # @!attribute [r]
21
+ # @return [String] subtitle of the article
11
22
  attr_reader_hash :sub_title, ""
23
+ # @!attribute [r]
24
+ # @return [String]
12
25
  attr_reader_hash :display_title
26
+ # @!attribute [r]
27
+ # @return [Integer] the number of movies related to the article
13
28
  attr_reader_hash :movie_count, 0
29
+ # @!attribute [r]
30
+ # @return [Integer] the number of photos related to the article
14
31
  attr_reader_hash :photo_count, 0
32
+ # @!attribute [r]
33
+ # @return [Integer] the number of music tracks related to the article
15
34
  attr_reader_hash :music_count, 0
16
35
 
17
- def initialize(id:, **attrs)
36
+ # @param [Hash] attrs attributes of article
37
+ # @option attrs [Date] :date date of the article
38
+ # @option attrs [String] :content content of the article
39
+ # @option attrs [String] :main_title main title of the article
40
+ # @option attrs [String] :sub_title subtitle of the article
41
+ # @option attrs [String] :display_title
42
+ # @option attrs [Integer] :movie_count the number of movies related to the article
43
+ # @option attrs [Integer] :photo_count the number of photos related to the article
44
+ # @option attrs [Integer] :music_count the number of music tracks related to the article
45
+ def initialize(id, **attrs)
46
+ raise "id is not a string" unless id.kind_of?(String)
18
47
  @id = id
19
48
  @attrs = attrs
20
49
  end
@@ -1,3 +1,3 @@
1
1
  class KCNA
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kcna
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Hinata
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-07-10 00:00:00.000000000 Z
11
+ date: 2017-07-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: httpclient
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '2.8'
27
+ - !ruby/object:Gem::Dependency
28
+ name: oga
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '2.10'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '2.10'
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: bundler
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -80,6 +94,20 @@ dependencies:
80
94
  - - "~>"
81
95
  - !ruby/object:Gem::Version
82
96
  version: '0.10'
97
+ - !ruby/object:Gem::Dependency
98
+ name: yard
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
83
111
  description:
84
112
  email:
85
113
  - syobon.hinata.public@gmail.com